From: Étienne Mollier Date: Sun, 11 Jun 2023 12:43:31 +0000 (+0200) Subject: New upstream version 0.21.0+ds X-Git-Tag: archive/raspbian/0.22.0+ds-1+rpi1^2^2~17^2 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=5486b17301f333bb9baa96f655598fcd64038e34;p=python-pysam.git New upstream version 0.21.0+ds --- diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..d8c6f97 --- /dev/null +++ b/.python-version @@ -0,0 +1,2 @@ +3.6 +3.11 diff --git a/NEWS b/NEWS index a0078cb..e0b77a9 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,28 @@ http://pysam.readthedocs.io/en/latest/release.html Release notes ============= +Release 0.21.0 +============== + +This release wraps htslib/samtools/bcftools version 1.17. + +Pysam is now compatible with Python 3.11. We have removed python 2.x +support. Pysam is tested with python versions 3.6 to 3.11. + +* [#1175] VariantHeader.new_record: set start/stop before alleles +* [#1173] Add multiple build improvements in htscodecs on multi-arch macOS +* [#1148] Ignore CIGAR-less reads in find_introns. +* [#1172] Add new `samtools cram-size` and `samtools reset` commands +* [#1169] Fix CRAM index-related crash when using the musl C standard library. +* [#1168] Add a minimal pyproject.toml for PEP517. +* [#1158] Fix type hints and add FastqProxy type hints. +* [#1147] Py3.11 compatibility, get shared object suffix from EXT_SUFFIX. +* [#1143] Add mypy symbols for samtools and bcftools. +* [#1155] Fix pysam.index() when using recent `samtools index` options. +* [#1151] Test suite py3.11 compatibility, work around failing test case. +* [#1149] MacOS universal build compatibility. +* [#1146] Fix build when CFLAGS/etc environment variables are set. + Release 0.20.0 ============== diff --git a/README.rst b/README.rst index 9a66049..4f19003 100644 --- a/README.rst +++ b/README.rst @@ -25,7 +25,7 @@ as it resolves non-python dependencies and uses pre-configured compilation options. 
Especially for OS X this will potentially save a lot of trouble. -The current version of pysam wraps 3rd-party code from htslib-1.16, samtools-1.16.1, and bcftools-1.16. +The current version of pysam wraps 3rd-party code from htslib-1.17, samtools-1.17, and bcftools-1.17. Pysam is available through `pypi <https://pypi.python.org/pypi/pysam>`_. To install, type:: diff --git a/bcftools/LICENSE b/bcftools/LICENSE index f223b09..6d40ae2 100644 --- a/bcftools/LICENSE +++ b/bcftools/LICENSE @@ -9,7 +9,7 @@ the INSTALL document), the use of this software is governed by the GPL license. The MIT/Expat License -Copyright (C) 2012-2021 Genome Research Ltd. +Copyright (C) 2012-2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/bcftools/abuf.c b/bcftools/abuf.c index 78682d6..7958cf5 100644 --- a/bcftools/abuf.c +++ b/bcftools/abuf.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2021-2022 Genome Research Ltd. + Copyright (c) 2021-2023 Genome Research Ltd. Author: Petr Danecek @@ -154,8 +154,19 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) assert(atom); if ( altb!='-' ) kputc(altb, &atom->alt); if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; } + continue; } - else + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + + if ( rlen!=alen && (i+1>=rlen || i+1>=alen) ) // the next base is an indel combined with SNV, e.g. C>GGG? 
{ buf->natoms++; hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); @@ -163,13 +174,13 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) atom->ref.l = 0; atom->alt.l = 0; kputc(refb, &atom->ref); - kputc(altb, &atom->alt); + kputc(refb, &atom->alt); atom->beg = atom->end = i; atom->ial = ial; } continue; } - if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion? + if ( i+1>=rlen || i+1>=alen ) // is the next base an indel? { buf->natoms++; hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); @@ -742,6 +753,8 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) _split_table_overlap(buf, j, atom); } } + // _split_table_print(buf); + // _split_table_print_atoms(buf); assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode // Create the output records, transferring all annotations: diff --git a/bcftools/abuf.c.pysam.c b/bcftools/abuf.c.pysam.c index d85a54c..6ac6d18 100644 --- a/bcftools/abuf.c.pysam.c +++ b/bcftools/abuf.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2021-2022 Genome Research Ltd. + Copyright (c) 2021-2023 Genome Research Ltd. Author: Petr Danecek @@ -156,8 +156,19 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) assert(atom); if ( altb!='-' ) kputc(altb, &atom->alt); if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; } + continue; } - else + buf->natoms++; + hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); + atom = &buf->atoms[buf->natoms-1]; + atom->ref.l = 0; + atom->alt.l = 0; + kputc(refb, &atom->ref); + kputc(altb, &atom->alt); + atom->beg = atom->end = i; + atom->ial = ial; + + if ( rlen!=alen && (i+1>=rlen || i+1>=alen) ) // the next base is an indel combined with SNV, e.g. C>GGG? 
{ buf->natoms++; hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); @@ -165,13 +176,13 @@ static void _atomize_allele(abuf_t *buf, bcf1_t *rec, int ial) atom->ref.l = 0; atom->alt.l = 0; kputc(refb, &atom->ref); - kputc(altb, &atom->alt); + kputc(refb, &atom->alt); atom->beg = atom->end = i; atom->ial = ial; } continue; } - if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion? + if ( i+1>=rlen || i+1>=alen ) // is the next base an indel? { buf->natoms++; hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms); @@ -744,6 +755,8 @@ void _abuf_split(abuf_t *buf, bcf1_t *rec) _split_table_overlap(buf, j, atom); } } + // _split_table_print(buf); + // _split_table_print_atoms(buf); assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode // Create the output records, transferring all annotations: diff --git a/bcftools/bam2bcf.c b/bcftools/bam2bcf.c index d373e99..88e25de 100644 --- a/bcftools/bam2bcf.c +++ b/bcftools/bam2bcf.c @@ -72,9 +72,11 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ, return bca; } +void bcf_iaux_destroy(bcf_callaux_t *bca); void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; + bcf_iaux_destroy(bca); errmod_destroy(bca->e); if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); @@ -89,28 +91,44 @@ void bcf_call_destroy(bcf_callaux_t *bca) free(bca->bases); free(bca->inscns); free(bca); } -static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref) +static int get_aux_nm(const bam_pileup1_t *p, int32_t qpos, int is_ref) { - uint8_t *nm_tag = bam_aux_get(rec, "NM"); - if ( !nm_tag ) return -1; - int64_t nm = bam_aux2i(nm_tag); + int64_t nm; + const bam_pileup_cd *cd = &p->cd; - // Count indels as single events, not as the number of inserted/deleted - // bases (which is what NM does). Add soft clips as mismatches. 
- int i; - for (i=0; i < rec->core.n_cigar; i++) + if ( PLP_NM(cd) == -1 ) return -1; + if ( PLP_NM(cd) == PLP_NM_UNSET ) { - int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; - if ( val==BAM_CSOFT_CLIP ) + // todo: make this localized to be useful for long reads as well + bam1_t *rec = p->b; + uint8_t *nm_tag = bam_aux_get(rec, "NM"); + if ( !nm_tag ) { - nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + PLP_NM(cd) = -1; + return -1; } - else if ( val==BAM_CINS || val==BAM_CDEL ) + nm = bam_aux2i(nm_tag); + + // Count indels as single events, not as the number of inserted/deleted + // bases (which is what NM does). Add soft clips as mismatches. + int i; + for (i=0; i < rec->core.n_cigar; i++) { - val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; - if ( val > 1 ) nm -= val - 1; + int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; + if ( val==BAM_CSOFT_CLIP ) + { + nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + } + else if ( val==BAM_CINS || val==BAM_CDEL ) + { + val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + if ( val > 1 ) nm -= val - 1; + } } + PLP_NM(cd) = nm; } + else + nm = PLP_NM(cd); // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV // http://www.genome.org/cgi/doi/10.1101/gr.239756.118 @@ -207,6 +225,9 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) memset(bca->alt_scl, 0, 100*sizeof(int)); memset(bca->iref_scl, 0, 100*sizeof(int)); memset(bca->ialt_scl, 0, 100*sizeof(int)); + int i; + for (i=0; i<2; i++) bca->nnm[i] = 0; + for (i=0; i<2; i++) bca->nm[i] = 0; } /* @@ -253,18 +274,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t int ADF_ref_missed[4] = {0}; for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; - int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; - if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; + int b; // the base or indel type + int q; // the base or indel quality used to calculate PL + int seqQ; // 
used to cap the indel quality given the sequence context + int mapQ; // to cap the quality for low MQ reads + int baseQ; // used only for supporting INFO annotations + int is_diff; // is this base or indel type different from the reference + int min_dist; // distance from the end, used for tail distance bias + if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(&p->cd) ) r->SCR++; if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; - if (p->is_del && !is_indel) continue; + + // The meaning of the indel related variables: + // is_indel .. is this position currently tested for an indel + // p->is_del .. is the current base a deletion in this read (unrelated to the tested indel) + // p->indel .. is there an indel starting after this position (i.e. does this read have the tested indel) + if (p->is_del && !is_indel) continue; // not testing an indel and the read has a spanning deletion + + int inm = -1; + ++ori_depth; - if (is_indel) + if (is_indel) // testing an indel position { - b = p->aux>>16&0x3f; + b = p->aux>>16&0x3f; // indel type seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias + + if ( !bca->indels_v20 ) + { + /* + This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain + correct result on the provided test case even when this code is commented out, so this + may not be needed anymore. Leaving it in only for backward compatibility for now. + See mpileup-tests homdel-issue-1446 and CHM1_CHM13_2.45x-1-1701408 which work only when + this code is disabled. + */ + if (p->indel == 0 && (q < _n/2 || _n > 20)) { + // high quality indel calls without p->indel set aren't + // particularly indicative of being a good REF match either, + // at least not in low coverage. So require solid coverage + // before we start utilising such quals. + b = 0; + q = (int)bam_get_qual(p->b)[p->qpos]; + seqQ = (3*seqQ + 2*q)/8; + } + if (_n > 20 && seqQ > 40) seqQ = 40; + } + + is_diff = b ? 
1 : 0; + if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) ) + { + inm = get_aux_nm(p,p->qpos,is_diff?0:1); + if ( inm>=0 ) + { + bca->nnm[is_diff]++; + bca->nm[is_diff] += inm; + } + } + if (q < bca->min_baseQ) { - if (!p->indel && b < 4) + if (!p->indel && b < 4) // not an indel read { if (bam_is_rev(p->b)) ADR_ref_missed[b]++; @@ -273,19 +341,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t } continue; } - if (p->indel == 0 && (q < _n/2 || _n > 20)) { - // high quality indel calls without p->indel set aren't - // particularly indicative of being a good REF match either, - // at least not in low coverage. So require solid coverage - // before we start utilising such quals. - b = 0; - q = (int)bam_get_qual(p->b)[p->qpos]; - seqQ = (3*seqQ + 2*q)/8; - } - if (_n > 20 && seqQ > 40) seqQ = 40; baseQ = p->aux>>8&0xff; - - is_diff = (b != 0); } else { @@ -307,6 +363,15 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t baseQ = q; seqQ = 99; is_diff = (ref4 < 4 && b == ref4)? 0 : 1; + if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) ) + { + inm = get_aux_nm(p,p->qpos,is_diff?0:1); + if ( inm>=0 ) + { + bca->nnm[is_diff]++; + bca->nm[is_diff] += inm; + } + } } mapQ = p->b->core.qual < 255? 
p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 if ( !mapQ ) r->mq0++; @@ -316,6 +381,8 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (q > 63) q = 63; if (q < 4) q = 4; // MQ=0 reads count as BQ=4 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; + //if (is_indel) fprintf(stderr,"xx:base,q,strand\t%d\t%d\t%d\n",b,q,bam_is_rev(p->b)?0:1); + // collect annotations if (b < 4) { @@ -343,19 +410,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if ( baseQ > 59 ) baseQ = 59; if ( mapQ > 59 ) mapQ = 59; int len, epos = 0, sc_len = 0, sc_dist = 0; - if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) ) + if ( bca->fmt_flag & (B2B_INFO_RPBZ|B2B_INFO_VDB|B2B_INFO_SCBZ) ) { int pos = get_position(p, &len, &sc_len, &sc_dist); - epos = (double)pos/(len+1) * bca->npos; - + epos = (double)pos/(len+1) * (bca->npos - 1); if (sc_len) { - sc_len = 15.0*sc_len / sc_dist; + sc_len = 15.0*sc_len / (sc_dist+1); if (sc_len > 99) sc_len = 99; } + assert( epos>=0 && epos<bca->npos ); + assert( sc_len>=0 && sc_len<bca->npos ); } int imq = mapQ * nqual_over_60; int ibq = baseQ * nqual_over_60; - int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1); if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; @@ -955,6 +1022,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int } // if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); + // fprintf(stderr,"sum_min=%f\n",sum_min); call->shift = (int)(sum_min + .499); } // combine annotations @@ -972,68 +1040,47 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly if ( !has_alt ) return 0; - calc_SegBias(calls, call); + if ( bca->fmt_flag & B2B_INFO_FS ) + { + double left,right,two; + call->strand_bias = kt_fisher_exact(call->anno[0], call->anno[1], call->anno[2], 
call->anno[3], &left, &right, &two); + } + if ( bca->fmt_flag & B2B_INFO_SGB ) calc_SegBias(calls, call); // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - if (bca->fmt_flag & B2B_INFO_ZSCORE) { - // U z-normalised as +/- number of standard deviations from mean. - if (call->ori_ref < 0) { // indel - if (bca->fmt_flag & B2B_INFO_RPB) - call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, - bca->npos, 0, 1); - call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, - bca->nqual,1,1); - if ( bca->fmt_flag & B2B_INFO_SCB ) - call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, - 100, 0,1); - } else { - if (bca->fmt_flag & B2B_INFO_RPB) - call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, - bca->npos, 0, 1); - call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, - bca->nqual,1,1); - call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, - bca->nqual,0,1); - call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, - bca->nqual,0,1); - if ( bca->fmt_flag & B2B_INFO_SCB ) - call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, - 100, 0,1); - } + // U z-normalised as +/- number of standard deviations from mean. 
+ if (call->ori_ref < 0) { // indel + if ( bca->fmt_flag & B2B_INFO_RPBZ ) + call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, bca->npos, 0, 1); + if ( bca->fmt_flag & B2B_INFO_MQBZ ) + call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, bca->nqual,1,1); + if ( bca->fmt_flag & B2B_INFO_SCBZ ) + call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, 100, 0,1); + } else { + if ( bca->fmt_flag & B2B_INFO_RPBZ ) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, bca->npos, 0, 1); + if ( bca->fmt_flag & B2B_INFO_MQBZ ) + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, bca->nqual,1,1); + if ( bca->fmt_flag & B2B_INFO_BQBZ ) + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, bca->nqual,0,1); + if ( bca->fmt_flag & B2B_INFO_MQSBZ ) + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, bca->nqual,0,1); + if ( bca->fmt_flag & B2B_INFO_SCBZ ) + call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, 100, 0,1); + } + if ( bca->fmt_flag & B2B_INFO_NMBZ ) call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1); - if ( bca->fmt_flag & B2B_FMT_NMBZ ) + if ( bca->fmt_flag & B2B_FMT_NMBZ ) + { + for (i=0; i<n; i++) { - for (i=0; i<n; i++) - { - float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1); - call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0; - } + float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1); + call->mwu_nm[i+1] = val!=HUGE_VAL ? 
val : 0; } - } else { - // Old method; U as probability between 0 and 1 - if ( bca->fmt_flag & B2B_INFO_RPB ) - call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, - bca->npos, 0, 0); - call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, - bca->nqual, 1, 0); - call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, - bca->nqual, 0, 0); - call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, - bca->nqual, 0, 0); } - -#if CDF_MWU_TESTS - // CDF version of MWU tests is not calculated by default - if ( bca->fmt_flag & B2B_INFO_RPB ) - call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); -#endif - if ( bca->fmt_flag & B2B_INFO_VDB ) call->vdb = calc_vdb(bca->alt_pos, bca->npos); @@ -1097,11 +1144,13 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, bc->tmp.l = 0; // INFO - if (bc->ori_ref < 0) + if ( bc->ori_ref < 0 ) { bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); - bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); - bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); + if ( fmt_flag&B2B_INFO_IDV ) + bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); + if ( fmt_flag&B2B_INFO_IMF ) + bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); } bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); if ( fmt_flag&B2B_INFO_ADF ) @@ -1126,46 +1175,37 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( has_alt ) { - if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); - if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - - if (bca->fmt_flag & B2B_INFO_ZSCORE) { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, 
"RPBZ", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); - if ( bc->mwu_nm[0] != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1); - if ( bc->mwu_sc != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); - } else { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + if ( fmt_flag&B2B_INFO_MIN_PL_SUM ) + bcf_update_info_int32(hdr, rec, "MIN_PL_SUM", &bc->shift, 1); + if ( fmt_flag&B2B_INFO_VDB && bc->vdb != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); + if ( fmt_flag&B2B_INFO_SGB && bc->seg_bias != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); + if ( fmt_flag&B2B_INFO_NM && (bca->nnm[0] || bca->nnm[1]) ) + { + for (i=0; i<2; i++) bc->nm[i] = bca->nnm[i] ? 
bca->nm[i]/bca->nnm[i] : 0; + bcf_update_info_float(hdr, rec, "NM", bc->nm, 2); } - if ( bc->strand_bias != HUGE_VAL ) + if ( fmt_flag&B2B_INFO_RPBZ && bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); + if ( fmt_flag&B2B_INFO_MQBZ && bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); + if ( fmt_flag&B2B_INFO_MQSBZ && bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); + if ( fmt_flag&B2B_INFO_BQBZ && bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); + if ( fmt_flag&B2B_INFO_NMBZ && bc->mwu_nm[0] != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1); + if ( fmt_flag&B2B_INFO_SCBZ && bc->mwu_sc != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); + if ( fmt_flag&B2B_INFO_FS && bc->strand_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); - -#if CDF_MWU_TESTS - if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); - if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); - if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); - if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); -#endif } tmpf[0] = bc->ori_depth ? 
(float)bc->mq0/bc->ori_depth : 0; - bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); + if ( fmt_flag&B2B_INFO_MQ0F ) + bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); // FORMAT rec->n_sample = bc->n; diff --git a/bcftools/bam2bcf.c.pysam.c b/bcftools/bam2bcf.c.pysam.c index 24c4270..4a6fe4d 100644 --- a/bcftools/bam2bcf.c.pysam.c +++ b/bcftools/bam2bcf.c.pysam.c @@ -74,9 +74,11 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ, int max_baseQ, return bca; } +void bcf_iaux_destroy(bcf_callaux_t *bca); void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; + bcf_iaux_destroy(bca); errmod_destroy(bca->e); if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); @@ -91,28 +93,44 @@ void bcf_call_destroy(bcf_callaux_t *bca) free(bca->bases); free(bca->inscns); free(bca); } -static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref) +static int get_aux_nm(const bam_pileup1_t *p, int32_t qpos, int is_ref) { - uint8_t *nm_tag = bam_aux_get(rec, "NM"); - if ( !nm_tag ) return -1; - int64_t nm = bam_aux2i(nm_tag); + int64_t nm; + const bam_pileup_cd *cd = &p->cd; - // Count indels as single events, not as the number of inserted/deleted - // bases (which is what NM does). Add soft clips as mismatches. - int i; - for (i=0; i < rec->core.n_cigar; i++) + if ( PLP_NM(cd) == -1 ) return -1; + if ( PLP_NM(cd) == PLP_NM_UNSET ) { - int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; - if ( val==BAM_CSOFT_CLIP ) + // todo: make this localized to be useful for long reads as well + bam1_t *rec = p->b; + uint8_t *nm_tag = bam_aux_get(rec, "NM"); + if ( !nm_tag ) { - nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + PLP_NM(cd) = -1; + return -1; } - else if ( val==BAM_CINS || val==BAM_CDEL ) + nm = bam_aux2i(nm_tag); + + // Count indels as single events, not as the number of inserted/deleted + // bases (which is what NM does). Add soft clips as mismatches. 
+ int i; + for (i=0; i < rec->core.n_cigar; i++) { - val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; - if ( val > 1 ) nm -= val - 1; + int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; + if ( val==BAM_CSOFT_CLIP ) + { + nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + } + else if ( val==BAM_CINS || val==BAM_CDEL ) + { + val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; + if ( val > 1 ) nm -= val - 1; + } } + PLP_NM(cd) = nm; } + else + nm = PLP_NM(cd); // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV // http://www.genome.org/cgi/doi/10.1101/gr.239756.118 @@ -209,6 +227,9 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call) memset(bca->alt_scl, 0, 100*sizeof(int)); memset(bca->iref_scl, 0, 100*sizeof(int)); memset(bca->ialt_scl, 0, 100*sizeof(int)); + int i; + for (i=0; i<2; i++) bca->nnm[i] = 0; + for (i=0; i<2; i++) bca->nm[i] = 0; } /* @@ -255,18 +276,65 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t int ADF_ref_missed[4] = {0}; for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; - int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; - if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++; + int b; // the base or indel type + int q; // the base or indel quality used to calculate PL + int seqQ; // used to cap the indel quality given the sequence context + int mapQ; // to cap the quality for low MQ reads + int baseQ; // used only for supporting INFO annotations + int is_diff; // is this base or indel type different from the reference + int min_dist; // distance from the end, used for tail distance bias + if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(&p->cd) ) r->SCR++; if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; - if (p->is_del && !is_indel) continue; + + // The meaning of the indel related variables: + // is_indel .. is this position currently tested for an indel + // p->is_del .. 
is the current base a deletion in this read (unrelated to the tested indel) + // p->indel .. is there an indel starting after this position (i.e. does this read have the tested indel) + if (p->is_del && !is_indel) continue; // not testing an indel and the read has a spanning deletion + + int inm = -1; + ++ori_depth; - if (is_indel) + if (is_indel) // testing an indel position { - b = p->aux>>16&0x3f; + b = p->aux>>16&0x3f; // indel type seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias + + if ( !bca->indels_v20 ) + { + /* + This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain + correct result on the provided test case even when this code is commented out, so this + may not be needed anymore. Leaving it in only for backward compatibility for now. + See mpileup-tests homdel-issue-1446 and CHM1_CHM13_2.45x-1-1701408 which work only when + this code is disabled. + */ + if (p->indel == 0 && (q < _n/2 || _n > 20)) { + // high quality indel calls without p->indel set aren't + // particularly indicative of being a good REF match either, + // at least not in low coverage. So require solid coverage + // before we start utilising such quals. + b = 0; + q = (int)bam_get_qual(p->b)[p->qpos]; + seqQ = (3*seqQ + 2*q)/8; + } + if (_n > 20 && seqQ > 40) seqQ = 40; + } + + is_diff = b ? 1 : 0; + if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) ) + { + inm = get_aux_nm(p,p->qpos,is_diff?0:1); + if ( inm>=0 ) + { + bca->nnm[is_diff]++; + bca->nm[is_diff] += inm; + } + } + if (q < bca->min_baseQ) { - if (!p->indel && b < 4) + if (!p->indel && b < 4) // not an indel read { if (bam_is_rev(p->b)) ADR_ref_missed[b]++; @@ -275,19 +343,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t } continue; } - if (p->indel == 0 && (q < _n/2 || _n > 20)) { - // high quality indel calls without p->indel set aren't - // particularly indicative of being a good REF match either, - // at least not in low coverage. 
So require solid coverage - // before we start utilising such quals. - b = 0; - q = (int)bam_get_qual(p->b)[p->qpos]; - seqQ = (3*seqQ + 2*q)/8; - } - if (_n > 20 && seqQ > 40) seqQ = 40; baseQ = p->aux>>8&0xff; - - is_diff = (b != 0); } else { @@ -309,6 +365,15 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t baseQ = q; seqQ = 99; is_diff = (ref4 < 4 && b == ref4)? 0 : 1; + if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) ) + { + inm = get_aux_nm(p,p->qpos,is_diff?0:1); + if ( inm>=0 ) + { + bca->nnm[is_diff]++; + bca->nm[is_diff] += inm; + } + } } mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 if ( !mapQ ) r->mq0++; @@ -318,6 +383,8 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (q > 63) q = 63; if (q < 4) q = 4; // MQ=0 reads count as BQ=4 bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b; + //if (is_indel) fprintf(bcftools_stderr,"xx:base,q,strand\t%d\t%d\t%d\n",b,q,bam_is_rev(p->b)?0:1); + // collect annotations if (b < 4) { @@ -345,19 +412,19 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if ( baseQ > 59 ) baseQ = 59; if ( mapQ > 59 ) mapQ = 59; int len, epos = 0, sc_len = 0, sc_dist = 0; - if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) ) + if ( bca->fmt_flag & (B2B_INFO_RPBZ|B2B_INFO_VDB|B2B_INFO_SCBZ) ) { int pos = get_position(p, &len, &sc_len, &sc_dist); - epos = (double)pos/(len+1) * bca->npos; - + epos = (double)pos/(len+1) * (bca->npos - 1); if (sc_len) { - sc_len = 15.0*sc_len / sc_dist; + sc_len = 15.0*sc_len / (sc_dist+1); if (sc_len > 99) sc_len = 99; } + assert( epos>=0 && epos<bca->npos ); + assert( sc_len>=0 && sc_len<bca->npos ); } int imq = mapQ * nqual_over_60; int ibq = baseQ * nqual_over_60; - int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1); if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++; @@ -957,6 +1024,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, 
bcf_callaux_t *bca, int } // if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); + // fprintf(bcftools_stderr,"sum_min=%f\n",sum_min); call->shift = (int)(sum_min + .499); } // combine annotations @@ -974,68 +1042,47 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly if ( !has_alt ) return 0; - calc_SegBias(calls, call); + if ( bca->fmt_flag & B2B_INFO_FS ) + { + double left,right,two; + call->strand_bias = kt_fisher_exact(call->anno[0], call->anno[1], call->anno[2], call->anno[3], &left, &right, &two); + } + if ( bca->fmt_flag & B2B_INFO_SGB ) calc_SegBias(calls, call); // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos); // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual); // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual); - if (bca->fmt_flag & B2B_INFO_ZSCORE) { - // U z-normalised as +/- number of standard deviations from mean. 
- if (call->ori_ref < 0) { // indel - if (bca->fmt_flag & B2B_INFO_RPB) - call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, - bca->npos, 0, 1); - call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, - bca->nqual,1,1); - if ( bca->fmt_flag & B2B_INFO_SCB ) - call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, - 100, 0,1); - } else { - if (bca->fmt_flag & B2B_INFO_RPB) - call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, - bca->npos, 0, 1); - call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, - bca->nqual,1,1); - call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, - bca->nqual,0,1); - call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, - bca->nqual,0,1); - if ( bca->fmt_flag & B2B_INFO_SCB ) - call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, - 100, 0,1); - } + // U z-normalised as +/- number of standard deviations from mean. + if (call->ori_ref < 0) { // indel + if ( bca->fmt_flag & B2B_INFO_RPBZ ) + call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, bca->npos, 0, 1); + if ( bca->fmt_flag & B2B_INFO_MQBZ ) + call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, bca->nqual,1,1); + if ( bca->fmt_flag & B2B_INFO_SCBZ ) + call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, 100, 0,1); + } else { + if ( bca->fmt_flag & B2B_INFO_RPBZ ) + call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, bca->npos, 0, 1); + if ( bca->fmt_flag & B2B_INFO_MQBZ ) + call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, bca->nqual,1,1); + if ( bca->fmt_flag & B2B_INFO_BQBZ ) + call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, bca->nqual,0,1); + if ( bca->fmt_flag & B2B_INFO_MQSBZ ) + call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, bca->nqual,0,1); + if ( bca->fmt_flag & B2B_INFO_SCBZ ) + call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, 100, 0,1); + } + if ( bca->fmt_flag & B2B_INFO_NMBZ ) call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1); - if ( bca->fmt_flag 
& B2B_FMT_NMBZ ) + if ( bca->fmt_flag & B2B_FMT_NMBZ ) + { + for (i=0; imwu_nm[i+1] = val!=HUGE_VAL ? val : 0; - } + float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1); + call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0; } - } else { - // Old method; U as probability between 0 and 1 - if ( bca->fmt_flag & B2B_INFO_RPB ) - call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, - bca->npos, 0, 0); - call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, - bca->nqual, 1, 0); - call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, - bca->nqual, 0, 0); - call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, - bca->nqual, 0, 0); } - -#if CDF_MWU_TESTS - // CDF version of MWU tests is not calculated by default - if ( bca->fmt_flag & B2B_INFO_RPB ) - call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos); - call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual); - call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual); - call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual); -#endif - if ( bca->fmt_flag & B2B_INFO_VDB ) call->vdb = calc_vdb(bca->alt_pos, bca->npos); @@ -1099,11 +1146,13 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, bc->tmp.l = 0; // INFO - if (bc->ori_ref < 0) + if ( bc->ori_ref < 0 ) { bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1); - bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); - bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); + if ( fmt_flag&B2B_INFO_IDV ) + bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1); + if ( fmt_flag&B2B_INFO_IMF ) + bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1); } bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1); if ( fmt_flag&B2B_INFO_ADF ) @@ -1128,46 +1177,37 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, if ( has_alt ) { - if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, 
rec, "VDB", &bc->vdb, 1); - if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); - - if (bca->fmt_flag & B2B_INFO_ZSCORE) { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); - if ( bc->mwu_nm[0] != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1); - if ( bc->mwu_sc != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); - } else { - if ( bc->mwu_pos != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1); - if ( bc->mwu_mq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1); - if ( bc->mwu_mqs != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1); - if ( bc->mwu_bq != HUGE_VAL ) - bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1); + if ( fmt_flag&B2B_INFO_MIN_PL_SUM ) + bcf_update_info_int32(hdr, rec, "MIN_PL_SUM", &bc->shift, 1); + if ( fmt_flag&B2B_INFO_VDB && bc->vdb != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1); + if ( fmt_flag&B2B_INFO_SGB && bc->seg_bias != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1); + if ( fmt_flag&B2B_INFO_NM && (bca->nnm[0] || bca->nnm[1]) ) + { + for (i=0; i<2; i++) bc->nm[i] = bca->nnm[i] ? 
bca->nm[i]/bca->nnm[i] : 0; + bcf_update_info_float(hdr, rec, "NM", bc->nm, 2); } - if ( bc->strand_bias != HUGE_VAL ) + if ( fmt_flag&B2B_INFO_RPBZ && bc->mwu_pos != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1); + if ( fmt_flag&B2B_INFO_MQBZ && bc->mwu_mq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1); + if ( fmt_flag&B2B_INFO_MQSBZ && bc->mwu_mqs != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1); + if ( fmt_flag&B2B_INFO_BQBZ && bc->mwu_bq != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1); + if ( fmt_flag&B2B_INFO_NMBZ && bc->mwu_nm[0] != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1); + if ( fmt_flag&B2B_INFO_SCBZ && bc->mwu_sc != HUGE_VAL ) + bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1); + if ( fmt_flag&B2B_INFO_FS && bc->strand_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1); - -#if CDF_MWU_TESTS - if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1); - if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1); - if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1); - if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1); -#endif } tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0; - bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); + if ( fmt_flag&B2B_INFO_MQ0F ) + bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1); // FORMAT rec->n_sample = bc->n; diff --git a/bcftools/bam2bcf.h b/bcftools/bam2bcf.h index c256b26..955c022 100644 --- a/bcftools/bam2bcf.h +++ b/bcftools/bam2bcf.h @@ -58,11 +58,21 @@ DEALINGS IN THE SOFTWARE. 
*/ #define B2B_INFO_SCR (1<<12) #define B2B_FMT_SCR (1<<13) #define B2B_INFO_VDB (1<<14) -#define B2B_INFO_RPB (1<<15) -#define B2B_FMT_QS (1<<16) -#define B2B_INFO_SCB (1<<17) -#define B2B_FMT_NMBZ (1<<18) // per-sample NMBZ -#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised +#define B2B_FMT_QS (1<<15) +#define B2B_FMT_NMBZ (1<<16) // per-sample NMBZ +#define B2B_INFO_NMBZ (1<<17) +#define B2B_INFO_BQBZ (1<<18) +#define B2B_INFO_MQBZ (1<<19) +#define B2B_INFO_MQSBZ (1<<20) +#define B2B_INFO_RPBZ (1<<21) +#define B2B_INFO_SCBZ (1<<22) +#define B2B_INFO_SGB (1<<23) +#define B2B_INFO_MIN_PL_SUM (1<<24) +#define B2B_INFO_NM (1<<25) +#define B2B_INFO_MQ0F (1<<26) +#define B2B_INFO_IDV (1<<27) +#define B2B_INFO_IMF (1<<28) +#define B2B_INFO_FS (1<<29) #define B2B_MAX_ALLELES 5 #define B2B_N_NM 32 // number of NMBZ bins, i.e. max number of mismatches @@ -72,13 +82,30 @@ DEALINGS IN THE SOFTWARE. */ #define B2B_INC_AD 1 #define B2B_INC_AD0 2 -#define PLP_HAS_SOFT_CLIP(i) ((i)&1) -#define PLP_HAS_INDEL(i) ((i)&2) -#define PLP_SAMPLE_ID(i) ((i)>>2) -#define PLP_SET_SOFT_CLIP(i) ((i)|=1) -#define PLP_SET_INDEL(i) ((i)|=2) -#define PLP_SET_SAMPLE_ID(i,n) ((i)|=(n)<<2) +// Pileup "client data" for each read to cache per-read information +#define PLP_CD(x) ((plp_cd_t*)((x)->p)) +#define PLP_HAS_SOFT_CLIP(cd) (PLP_CD(cd)->i & 1) +#define PLP_HAS_INDEL(cd) (PLP_CD(cd)->i & 2) +#define PLP_IS_REALN(cd) (PLP_CD(cd)->i & 4) +#define PLP_SAMPLE_ID(cd) (PLP_CD(cd)->i >> 3) +#define PLP_QLEN(cd) (PLP_CD(cd)->qlen) +#define PLP_NM(cd) (PLP_CD(cd)->nm) +#define PLP_NM_UNSET -2 + +#define PLP_SET_SOFT_CLIP(cd) (PLP_CD(cd)->i |= 1) +#define PLP_SET_INDEL(cd) (PLP_CD(cd)->i |= 2) +#define PLP_SET_REALN(cd) (PLP_CD(cd)->i |= 4) +#define PLP_SET_SAMPLE_ID(cd,n) (PLP_CD(cd)->i |= (n)<<3) + +typedef struct +{ + int64_t i; // used to store sample id and flags for presence of soft-clip and indel + uint32_t qlen; // cached output of bam_cigar2qlen(), 0 if unset + int nm; // -2 
PLP_NM_UNSET; -1 not available; >=0 NM value computed by get_aux_nm() +} +plp_cd_t; + typedef struct __bcf_callaux_t { int fmt_flag, ambig_reads; @@ -95,7 +122,7 @@ typedef struct __bcf_callaux_t { // for internal uses int max_bases; int indel_types[4]; // indel lengths - int indel_win_size; + int indel_win_size, indels_v20; int maxins, indelreg; int read_len; char *inscns; @@ -104,6 +131,10 @@ typedef struct __bcf_callaux_t { void *rghash; float indel_bias; // adjusts indel score threshold; lower => call more. int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm} + unsigned int nnm[2]; // number of nm observations + float nm[2]; // cumulative count of mismatches in ref and alt reads + void *iaux; // auxiliary structure for --indels-2.0 calling + char *chr; // current chromosome } bcf_callaux_t; // per-sample values @@ -134,22 +165,21 @@ typedef struct { bcf_hdr_t *bcf_hdr; int a[5]; // alleles: ref, alt, alt2, alt3 float qsum[B2B_MAX_ALLELES]; // INFO/QS tag - int n, n_alleles, shift, ori_ref, unseen; + int n, n_alleles, ori_ref, unseen; + int32_t shift; // shift is the sum of min_PL before normalization to 0 across all samples int n_supp; // number of supporting non-reference reads double anno[16]; unsigned int depth, ori_depth, mq0; int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS, *ref_nm, *alt_nm; uint8_t *fmt_arr; float vdb; // variant distance bias - float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc, *mwu_nm; -#if CDF_MWU_TESTS - float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf; -#endif + float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc, *mwu_nm, nm[2]; float seg_bias; float strand_bias; // phred-scaled fisher-exact test kstring_t tmp; } bcf_call_t; + #ifdef __cplusplus extern "C" { #endif @@ -162,8 +192,12 @@ extern "C" { int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref); int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); 
+ int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); + int bcf_cgp_l_run(const char *ref, int pos); + int est_indelreg(int pos, const char *ref, int l, char *ins4); + #ifdef __cplusplus } #endif diff --git a/bcftools/bam2bcf_iaux.c b/bcftools/bam2bcf_iaux.c new file mode 100644 index 0000000..2e0add1 --- /dev/null +++ b/bcftools/bam2bcf_iaux.c @@ -0,0 +1,737 @@ +/* bam2bcf_iaux.c -- modified indel caller + + Copyright (C) 2022 Genome Research Ltd. + + Author: pd3@sanger, jkb + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "bam2bcf.h" +#include "read_consensus.h" +#include "cigar_state.h" + +#include +KSORT_INIT_STATIC_GENERIC(uint32_t) + +#ifndef DEBUG_ALN +#define DEBUG_ALN 0 +#endif + +#define MAX_TYPES 64 + +typedef struct +{ + int pos; // current position + char *chr; // current chromosome + int nsmpl; // number of samples + int *nplp; // per-sample number of reads + bam_pileup1_t **plp; // per-sample reads + bcf_callaux_t *bca; // auxiliary bam2bcf structure + const char *ref; // reference genome (ASCII) + uint32_t *uitmp; // temporary unsigned int array + char *inscns; // insertions consensus "ACGTN"[itype*max_ins_len+i] + int muitmp, minscns; // size of uitmp, inscns + int iref_type, ntypes, types[MAX_TYPES]; // indel types + int max_ins_len; // largest insertion + int left, right; // consensus sequence boundaries, 0-based fa ref coordinates + read_cns_t *rcns; // read consensus + cns_seq_t *cns_seq; // array of consensus sequences + int *cns_pos; // array of relative pos indexes within cns_seq sequences + uint8_t *ref_seq, *qry_seq; // reference and query sequence to align + int nref_seq, nqry_seq; // the allocated size of ref_seq and qry_seq + uint8_t *qual; + int nqual; + int *read_scores, // read scores for each indel type [ntypes*iread+itype] + mread_scores, + ref_qual[MAX_TYPES], // refseq quality at pos for each indel type in the context of homopolymer runs + sum_qual[MAX_TYPES]; // qual contributions to each indel type from all reads +} +indel_aux_t; + +#if DEBUG_ALN +static void debug_print_types(indel_aux_t *iaux) +{ + int i,j; + fprintf(stderr,"types at %s:%d ntypes=%d... 
",iaux->chr,iaux->pos+1,iaux->ntypes); + for (i=0; intypes; i++) + { + fprintf(stderr," type%d=",i); + if ( iaux->types[i]<=0 ) + { + if ( i==iaux->iref_type ) fprintf(stderr,"%d(ref)",iaux->types[i]); + else fprintf(stderr,"%d",iaux->types[i]); + continue; + } + char *cns = &iaux->inscns[i*iaux->max_ins_len]; + for (j=0; jtypes[i]; j++) fprintf(stderr,"%c","ACGTN"[(int)cns[j]]); + } + fprintf(stderr,"\n"); +} +#else +#define debug_print_types(iaux) +#endif + +void bcf_iaux_destroy(bcf_callaux_t *bca) +{ + if ( !bca->iaux ) return; + indel_aux_t *iaux = (indel_aux_t*)bca->iaux; + free(iaux->uitmp); + free(iaux->inscns); + free(iaux->ref_seq); + free(iaux->qry_seq); + free(iaux->qual); + free(iaux->read_scores); + rcns_destroy(iaux->rcns); + free(iaux); +} + +static void iaux_init_sequence_context(indel_aux_t *iaux) +{ + // Calculate left and right boundary. The array types is sorted in ascending order, the first + // element is the largest deletion (if a deletion present) + iaux->left = iaux->pos > iaux->bca->indel_win_size ? iaux->pos - iaux->bca->indel_win_size : 0; + iaux->right = iaux->pos + iaux->bca->indel_win_size; + if ( iaux->types[0] < 0 ) iaux->right -= iaux->types[0]; // extend by the largest deletion length + + // In case the alignments stand out the reference + int i; + for (i=iaux->pos; iright; i++) + if ( !iaux->ref[i] ) break; + iaux->right = i; + + // Sequence quality in the context of homopolymers for each indel type + int l_run = bcf_cgp_l_run(iaux->ref, iaux->pos); // The length of the homopolymer run around the current position + for (i=0; intypes; i++) + { + int l = iaux->types[i]; + + // This is the original est_seqQ() code. FIXME: check if the inserted sequence is consistent with the homopolymer run + int q = iaux->bca->openQ + iaux->bca->extQ * (abs(l) - 1); + int qh = l_run >= 3? (int)(iaux->bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; + if ( q > qh ) q = qh; + + iaux->ref_qual[i] = q < 255 ? 
q : 255; + } + + // Determine the indel region, this makes the difference between e.g. T>TA vs TA>TAA + iaux->bca->indelreg = 0; + for (i=0; intypes; i++) + { + if ( !iaux->types[i] ) continue; + int ireg; + if ( iaux->types[i] > 0 ) + ireg = est_indelreg(iaux->pos, iaux->ref, iaux->types[i], &iaux->inscns[i*iaux->max_ins_len]); + else + ireg = est_indelreg(iaux->pos, iaux->ref, -iaux->types[i], 0); + if ( ireg > iaux->bca->indelreg ) iaux->bca->indelreg = ireg; + } +} + +static int iaux_init_scores(indel_aux_t *iaux, int ismpl) +{ + int n = iaux->nplp[ismpl] * iaux->ntypes; + if ( iaux->mread_scores < n ) + { + int *tmp = (int*) realloc(iaux->read_scores,n*sizeof(int)); + if ( !tmp ) return -1; + iaux->mread_scores = n; + iaux->read_scores = tmp; + } + memset(iaux->read_scores,0,n); + return 0; +} + +static int _have_indel_reads(indel_aux_t *iaux) +{ + int i,j; + for (i=0; insmpl; i++) + { + for (j=0; jnplp[i]; j++) + if ( iaux->plp[i][j].indel ) return 1; + } + return 0; +} + +// For insertions only their sizes were collected so far. 
Now go through the reads and +// create consensus sequence for each insert, therefore note that there can be only one +// sequence per insertion length +static int iaux_init_ins_types(indel_aux_t *iaux) +{ + if ( !iaux->max_ins_len ) return 0; + + uint32_t *aux; + int naux = 5 * iaux->ntypes * iaux->max_ins_len; + if ( iaux->muitmp < naux ) + { + aux = (uint32_t*) realloc(iaux->uitmp,naux*sizeof(*aux)); + if ( !aux ) return -1; + iaux->uitmp = aux; + iaux->muitmp = naux; + } + else aux = iaux->uitmp; + memset(aux,0,naux*sizeof(*aux)); + + // count the number of occurrences of each base at each position for each type of insertion + int t,s,i,j; + for (t=0; tntypes; t++) + { + if ( iaux->types[t] <= 0) continue; + for (s=0; snsmpl; s++) + { + for (i=0; inplp[s]; i++) + { + bam_pileup1_t *plp = iaux->plp[s] + i; + if ( plp->indel != iaux->types[t] ) continue; + uint8_t *seq = bam_get_seq(plp->b); + for (j=0; jindel; j++) + { + int c = seq_nt16_int[bam_seqi(seq, plp->qpos+j+1)]; + assert(c<5); + aux[5*(t*iaux->max_ins_len+j) + c]++; + } + } + } + } + + char *cns; + int ncns = iaux->ntypes * iaux->max_ins_len; + if ( iaux->minscns < ncns ) + { + cns = (char*) realloc(iaux->inscns,naux*sizeof(*aux)); + if ( !cns ) return -1; + iaux->inscns = cns; + iaux->minscns = ncns; + } + else cns = iaux->inscns; + memset(aux,0,ncns*sizeof(*cns)); + + // use the majority rule to construct the consensus + for (t=0; tntypes; t++) + { + for (i=0; itypes[t]; i++) // this naturally includes only insertions + { + uint32_t *tmp = &aux[5*(t*iaux->max_ins_len+i)], max = tmp[0], max_j = 0; + for (j=1; j<5; j++) + if ( max < tmp[j] ) max = tmp[j], max_j = j; + cns[t*iaux->max_ins_len + i] = max ? 
max_j : 4; + if ( max_j==4 ) { iaux->types[t] = 0; break; } // discard insertions which contain N's + } + } + return 0; +} + +#define MINUS_CONST 0x10000000 +static int iaux_init_types(indel_aux_t *iaux) +{ + if ( !_have_indel_reads(iaux) ) return 0; + + iaux->bca->max_support = 0; + memset(iaux->sum_qual,0,MAX_TYPES*sizeof(*iaux->sum_qual)); + + int i,j, nreads = 0; + for (i=0; insmpl; i++) nreads += iaux->nplp[i]; + + uint32_t *aux; + if ( iaux->muitmp < nreads+1 ) + { + aux = (uint32_t*) realloc(iaux->uitmp,(nreads+1)*sizeof(*iaux->uitmp)); + if ( !aux ) return -1; + iaux->uitmp = aux; + iaux->muitmp = nreads+1; + } + else aux = iaux->uitmp; + memset(aux,0,(nreads+1)*sizeof(*aux)); + + int naux = 0, indel_support_ok = 0, n_alt = 0, n_tot = 0; + int max_rd_len = 0; // max sequence length that includes ref+del bases + + // Fill out aux[] array with all the non-zero indel sizes. This is an unsorted list with as many + // entries as there are reads + aux[naux++] = MINUS_CONST; // zero indel is always a type (REF) + for (i=0; insmpl; i++) + { + int nalt = naux, ntot = 0; // per sample values + for (j=0; jnplp[i]; j++) + { + const bam_pileup1_t *plp = iaux->plp[i] + j; + ntot++; + if ( plp->indel ) aux[naux++] = MINUS_CONST + plp->indel; + if ( !PLP_QLEN(&plp->cd) ) PLP_QLEN(&plp->cd) = bam_cigar2qlen(plp->b->core.n_cigar, bam_get_cigar(plp->b)); + if ( PLP_QLEN(&plp->cd) > max_rd_len ) max_rd_len = PLP_QLEN(&plp->cd); + } + nalt = naux - nalt; + if ( iaux->bca->per_sample_flt ) + { + double frac = (double)nalt/naux; + if ( nalt >= iaux->bca->min_support && frac >= iaux->bca->min_frac ) indel_support_ok = 1; + if ( nalt > iaux->bca->max_support && frac > 0 ) iaux->bca->max_support = nalt, iaux->bca->max_frac = frac; + } + else + { + n_alt += nalt; + n_tot += ntot; + } + } + + // Check if the minimum required number of indel reads has been observed + if ( !iaux->bca->per_sample_flt && n_alt >= iaux->bca->min_support && (double)n_alt/n_tot >= iaux->bca->min_frac ) 
indel_support_ok = 1; + if ( naux==1 || !indel_support_ok ) return 0; + + // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), check the number of N's in the + // sequence and skip places where half or more reference bases in the sequence that follows pos are Ns + int nN = 0, i_end = iaux->pos + (iaux->bca->indel_win_size < max_rd_len ? iaux->bca->indel_win_size : max_rd_len); + for (i=iaux->pos; iref[i]; i++) + if ( iaux->ref[i] == 'N' ) nN++; + if ( 2*nN > i - iaux->pos ) return -1; + + // Sort aux[] and dedup indel types + int n_types = 1; + ks_introsort(uint32_t, naux, aux); + for (i=1; i= MAX_TYPES ) + { + static int warned = 0; + if ( !warned ) + { + fprintf(stderr, "Warning: excessive number of INDEL alleles at %s:%d, skipping. (This warning is printed only once)\n",iaux->chr,iaux->pos+1); + warned = 1; + } + return -1; + } + + // Fill out the types[] array detailing the size of insertion or deletion. + iaux->ntypes = 0; + iaux->max_ins_len = 0; + for (i=0; iiref_type = iaux->ntypes; + } + else + { + if ( j-i >= iaux->bca->min_support ) is_ok = 1; + // What is the best way to handle the -pmF options: + // - consider only sites where a single indel type passes the -mF threshold, as opposed to all indel types cumulatively + // - once a site passes, include all indel types in the evaluation, as opposed to considering only the strong candidates + // In this implementation sites are selected by counting reads from all indel types cumulatively and all indel types + // are considered. 
+ // Uncomment the following condition to consider only strong indel candidates once the site has been selected + // if ( !iaux->bca->per_sample_flt && (double)(j-i) / n_tot < iaux->bca->min_frac ) is_ok = 0; + } + if ( is_ok ) + { + iaux->types[iaux->ntypes++] = isize; + if ( isize > 0 && isize > iaux->max_ins_len ) iaux->max_ins_len = isize; + } + i = j-1; + } + if ( iaux->ntypes <= 1 ) return 0; + + // Init insertion types, including their sequence + if ( iaux_init_ins_types(iaux) < 0 ) return -1; + + iaux_init_sequence_context(iaux); + + return iaux->ntypes; +} +#undef MINUS_CONST + +static int iaux_set_consensus(indel_aux_t *iaux, int ismpl) +{ + if ( !iaux->rcns ) + iaux->rcns = rcns_init(iaux->pos, iaux->left, iaux->right); + else + rcns_reset(iaux->rcns, iaux->pos, iaux->left, iaux->right); + + rcns_set_reads(iaux->rcns, iaux->plp[ismpl], iaux->nplp[ismpl]); + + iaux->cns_seq = rcns_get_consensus(iaux->rcns, iaux->ref + iaux->left); + +// todo: +// rcns should also collect localized number of mismatches as a substitute +// for uninformative MQ. This would not affect calling but would help with +// filtering + + return 0; +} + +#if 0 +// Finds the smallest index in the seq_pos array holding value equal to pos, or if there is no +// such value, the largest index with value smaller than pos. Starts at initial guess ioff. +// This could use a binary search but the assumption is that the initial guess is indel-size close +// to the actuall coordinate. 
+// +// TODO: remove this function and seq_pos from cns creation as it seems unnecessary +static int find_ref_offset(hts_pos_t pos, hts_pos_t *seq_pos, int nseq_pos, int ioff) +{ + if ( ioff<0 ) ioff = 0; + else if ( ioff >= nseq_pos ) ioff = nseq_pos - 1; + if ( seq_pos[ioff] < pos ) + { + while ( ioff+1 < nseq_pos && seq_pos[ioff] < pos ) ioff++; + if ( seq_pos[ioff] > pos ) ioff--; + return ioff; + } + while ( ioff > 0 && seq_pos[ioff-1] >= pos ) ioff--; + return ioff; +} +#endif + +static int iaux_align_read(indel_aux_t *iaux, bam1_t *bam, uint8_t *ref_seq, int nref_seq) +{ + if ( bam->core.flag & BAM_FUNMAP ) return 1; // skip unmapped reads + + // Trim both ref and qry to the window of interest + hts_pos_t ref_beg = iaux->left; // fa ref coordinates + hts_pos_t ref_end = iaux->right < ref_beg + nref_seq ? iaux->right : ref_beg + nref_seq - 1; + + cigar_state_t cigar; + cstate_init(&cigar,bam); + int qry_off1, qry_off2, ref_off1, ref_off2; + if ( ref_beg > bam->core.pos ) + { + // the read needs trimming from left + qry_off1 = cstate_seek_fwd(&cigar, &ref_beg, 1); + ref_off1 = ref_beg - iaux->left; + + if ( ref_beg + (bam->core.l_qseq - qry_off1) > ref_end ) + { + // the read needs trimming from right + qry_off2 = ref_end - ref_beg + qry_off1; + ref_off2 = ref_end - iaux->left; + } + else + { + // the ref template needs trimming from right + qry_off2 = bam->core.l_qseq - 1; + ref_off2 = ref_off1 + qry_off2 - qry_off1; + } + } + else + { + // the ref template needs trimming from left + qry_off1 = 0; + ref_off1 = bam->core.pos - ref_beg; + + if ( bam->core.pos + bam->core.l_qseq - 1 > ref_end ) + { + // the read needs trimming from right + ref_off2 = ref_end - iaux->left; + qry_off2 = ref_off2 - ref_off1; + } + else + { + // the ref template needs trimming from right + qry_off2 = bam->core.l_qseq - 1; + ref_off2 = ref_off1 + qry_off2 - qry_off1; + } + } +//fprintf(stderr,"xtrim: %s .. 
left,right=%d,%d rbeg,end=%d,%d qpos=%d qlen=%d qoff=%d,%d roff=%d,%d rlen=%d\n",bam_get_qname(bam),iaux->left,iaux->right,(int)ref_beg,(int)ref_end,(int)bam->core.pos,bam->core.l_qseq, qry_off1,qry_off2,ref_off1,ref_off2,nref_seq); + + assert( qry_off1<=qry_off2 ); + assert( qry_off1>=0 && qry_off1core.l_qseq ); + assert( qry_off2>=0 && qry_off2core.l_qseq ); + + assert( ref_off1<=ref_off2 ); + assert( ref_off1>=0 && ref_off1=0 && ref_off2nqry_seq < qlen ) + { + uint8_t *tmp = (uint8_t*) realloc(iaux->qry_seq, qlen); + if ( !tmp ) return -1; // critical error + iaux->qry_seq = tmp; + iaux->nqry_seq = qlen; + } + uint8_t *seq = bam_get_seq(bam); + for (i=qry_off1; i<=qry_off2; i++) iaux->qry_seq[i-qry_off1] = seq_nt16_int[bam_seqi(seq,i)]; + + // prepare qualities, either BQ or BAQ qualities (ZQ) + if ( iaux->nqual < qlen ) + { + uint8_t *tmp = (uint8_t*) realloc(iaux->qual, qlen); + if ( !tmp ) return -1; // critical error + iaux->qual = tmp; + iaux->nqual = qlen; + } + uint8_t *qual = iaux->qual; + const uint8_t *qq = bam_get_qual(bam); + const uint8_t *bq = (uint8_t*)bam_aux_get(bam, "ZQ"); + if ( bq ) bq++; // skip type + for (i=qry_off1; i<=qry_off2; i++) + { + int j = i - qry_off1; + qual[j] = bq ? qq[i] + (bq[i] - 64) : qq[i]; + if ( qual[j] > 30 ) qual[j] = 30; + if ( qual[j] < 7 ) qual[j] = 7; + } + +// Illumina +probaln_par_t apf = { 1e-4, 1e-2, 10 }; + + // align + int score = probaln_glocal(ref_seq + ref_off1, rlen, iaux->qry_seq, qlen, qual, &apf, 0, 0); + int adj_score = (int)(100. * score / qlen + .499) * iaux->bca->indel_bias; + +#if DEBUG_ALN + fprintf(stderr,"aln: %d/%d\t%s\n\tref: ",score,adj_score,bam_get_qname(bam)); + for (i=0; iqry_seq[i]]); + fprintf(stderr,"\n\tqual: "); + for (i=0; i 255 ) adj_score = 255; + return score<<8 | adj_score; +} + +// Score all reads for this sample and indel type using the up to two consensus sequence templates. 
+// On output sets iaux->read_scores[iread*ntypes+itype] = (raw_score<<8 | length_adjusted_score) +static int iaux_score_reads(indel_aux_t *iaux, int ismpl, int itype) +{ + int i; + cns_seq_t *cns = iaux->cns_seq; + while ( cns->nseq ) + { + // Resize buffers if necessary + int ref_len = cns->nseq + iaux->types[itype]; + if ( iaux->nref_seq < ref_len ) + { + uint8_t *ref_buf = (uint8_t*) realloc(iaux->ref_seq,sizeof(uint8_t)*ref_len); + if ( !ref_buf ) return -1; + iaux->ref_seq = ref_buf; + iaux->nref_seq = ref_len; + } + + // Apply the indel and create the template ref sequence... + memcpy(iaux->ref_seq,cns->seq,(cns->ipos+1)*sizeof(*iaux->ref_seq)); + if ( iaux->types[itype] < 0 ) // deletion + memcpy(iaux->ref_seq + cns->ipos + 1, cns->seq + cns->ipos + 1 - iaux->types[itype], (cns->nseq - cns->ipos - 1 + iaux->types[itype])*sizeof(*iaux->ref_seq)); + else + { + char *ins = &iaux->inscns[itype*iaux->max_ins_len]; + for (i=0; itypes[itype]; i++) iaux->ref_seq[cns->ipos+1+i] = ins[i]; + memcpy(iaux->ref_seq + cns->ipos + 1 + iaux->types[itype], cns->seq + 1 + cns->ipos, (cns->nseq - cns->ipos - 1)*sizeof(*iaux->ref_seq)); + } + +#if DEBUG_ALN + fprintf(stderr,"template %d, type %d, sample %d: ",cns==iaux->cns_seq?0:1,itype,ismpl); + for (i=0; iref_seq[i]]); + fprintf(stderr,"\n"); +#endif + + // Align and score reads + for (i=0; inplp[ismpl]; i++) + { + const bam_pileup1_t *plp = iaux->plp[ismpl] + i; + int aln_score = iaux_align_read(iaux, plp->b, iaux->ref_seq, ref_len); + int *score = &iaux->read_scores[i*iaux->ntypes+itype]; + if ( cns==iaux->cns_seq || *score > aln_score ) *score = aln_score; + } + cns++; + } + return 0; +} + +// Determines indel quality for each read and populates 22 bits of pileup aux field with +// three integers as follows +// plp->aux = indel_type << 16 | seqQ << 8 | indelQ +static int iaux_eval_scored_reads(indel_aux_t *iaux, int ismpl) +{ + int i,j; + for (i=0; inplp[ismpl]; i++) + { + bam_pileup1_t *plp = iaux->plp[ismpl] + i; + + // 
Find the best indel type and the ref type, their scores difference is the indel quality + int *score = &iaux->read_scores[i*iaux->ntypes]; + int alt_score = INT_MAX, alt_j = 0; + for (j=0; jiref_type; j++) + if ( alt_score > score[j] ) alt_score = score[j], alt_j = j; + for (j=iaux->iref_type+1; jntypes; j++) + if ( alt_score > score[j] ) alt_score = score[j], alt_j = j; + int ref_score = score[iaux->iref_type]; + int sc0, sc1, j0; + if ( alt_score < ref_score ) sc0 = alt_score, sc1 = ref_score, j0 = alt_j; + else sc0 = ref_score, sc1 = alt_score, j0 = iaux->iref_type; + + int indelQ = (sc1>>8) - (sc0>>8); // low=bad, high=good + int seqQ = iaux->ref_qual[alt_j]; + + // Reduce indelQ. High length-normalized alignment scores (i.e. bad alignments) + // lower the quality more (e.g. gnuplot> plot [0:111] (1-x/111.)*255) + int len_normQ = sc0 & 0xff; // length-normalized score of the best match (ref or alt) + int adj_indelQ; // final indelQ used in calling + if ( len_normQ > 111 ) + { + // In the original code reads matching badly to any indel type or reference had indelQ set to 0 + // here and thus would be effectively removed from calling. This leads to problems when there are + // many soft clipped reads and a few good matching indel reads (see noisy-softclips.bam in + // mpileup-tests). Only the few good quality indel reads would become visible to the caller and + // the indel would be called with high quality. Here we change the logic to make the badly matching + // reads low quality reference reads. The threshold was set to make the test case still be called + // as an indel, but with very low quality. + // + // Original code: + // adj_indelQ = 0; + // + adj_indelQ = 12; + j0 = iaux->iref_type; + } + else + adj_indelQ = (int)((1. - len_normQ/111.) 
* indelQ + .499); + +#if DEBUG_ALN + // Prints the selected indel type (itype); adjusted indelQ which will be used if bigger than seqQ; + // raw indelQ; length-normalized indelQ and sequence context quality; ref and best alt indel type + // and their raw and length-normalized scores + fprintf(stderr,"itype=%d adj_indelQ=%d\trawQ=%d\tlen_normQ=%d\tseqQ=%d\tref:%d=%d/%d alt:%d=%d/%d)\t%s\n", + j0,adj_indelQ,indelQ,len_normQ,seqQ,iaux->iref_type,ref_score>>8,ref_score&0xff,alt_j,alt_score>>8,alt_score&0xff,bam_get_qname(plp->b)); +#endif + + if ( adj_indelQ > seqQ ) adj_indelQ = seqQ; // seqQ already capped at 255 + plp->aux = j0<<16 | seqQ<<8 | adj_indelQ; // use 22 bits in total + iaux->sum_qual[j0] += adj_indelQ; + } + return 0; +} + +// Find the best indel types, include the ref type plus maximum three alternate indel alleles. +static int iaux_eval_best_indels(indel_aux_t *iaux) +{ + bcf_callaux_t *bca = iaux->bca; + bca->maxins = iaux->max_ins_len; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + if ( bca->maxins && !bca->inscns ) return -1; + + // insertion sort, descending, high-quality indels come first + int i,j,t, tmp, *sumq = iaux->sum_qual, ntypes = iaux->ntypes; + for (t=0; t0 && sumq[j] > sumq[j-1]; j--) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t=0; tiref_type ) break; + if ( t ) + { + // move the reference type to the first + tmp = sumq[t]; + for (; t>0; t--) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + + // Initialize bca's structures and create a mapping between old and new types + int old2new_type[MAX_TYPES]; + for (t=0; tntypes; t++) + { + int itype = sumq[t] & 0x3f; + old2new_type[itype] = t; + if ( t>=4 ) continue; + bca->indel_types[t] = iaux->types[itype]; + if ( bca->indel_types[t] <= 0 ) continue; + memcpy(&bca->inscns[t*bca->maxins], &iaux->inscns[itype*iaux->max_ins_len], bca->maxins); + } + + // Update indel type in plp->aux for all reads + int ismpl, n_alt = 0; + for (ismpl=0; ismplnsmpl; ismpl++) + { + 
for (i=0; inplp[ismpl]; i++) + { + bam_pileup1_t *plp = iaux->plp[ismpl] + i; + int itype_old = (plp->aux >> 16) & 0x3f; + int itype_new = old2new_type[itype_old]; + plp->aux = itype_new<<16 | (itype_new>=4 ? 0 : (plp->aux & 0xffff)); + if ( itype_new>0 ) n_alt++; + } + } + return n_alt; +} + +/* + notes: + - n .. number of samples + - the routine sets bam_pileup1_t.aux (27 bits) of each read as follows: + - 5: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. aux&0xff + */ +int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +{ +assert(!(ref == 0 || bca == 0)); // can this ever happen? when? + if (ref == 0 || bca == 0) return -1; + + if ( !bca->iaux ) bca->iaux = calloc(1,sizeof(indel_aux_t)); + indel_aux_t *iaux = bca->iaux; + iaux->nsmpl = n; + iaux->nplp = n_plp; + iaux->plp = plp; + iaux->bca = bca; + iaux->ref = ref; + iaux->pos = pos; + iaux->chr = bca->chr; + + // Check if there is an indel at this position and if yes, find all indel types and determine + // window boundaries. todo: We want this information cached so that for long reads we don't keep + // redoing the whole analysis again and again + int ntypes = iaux_init_types(iaux); + if ( ntypes<=0 ) return -1; + + debug_print_types(iaux); + + // Create two template consensus sequences for each sample (assuming max diploid organism). + // Then apply each indel type on top of the templates, realign every read and remember score + int i,j; + for (i=0; insmpl; i++) + { + iaux_set_consensus(iaux, i); + iaux_init_scores(iaux, i); + for (j=0; j 0 ? 
0 : -1; +} + diff --git a/bcftools/bam2bcf_iaux.c.pysam.c b/bcftools/bam2bcf_iaux.c.pysam.c new file mode 100644 index 0000000..c8bea99 --- /dev/null +++ b/bcftools/bam2bcf_iaux.c.pysam.c @@ -0,0 +1,739 @@ +#include "bcftools.pysam.h" + +/* bam2bcf_iaux.c -- modified indel caller + + Copyright (C) 2022 Genome Research Ltd. + + Author: pd3@sanger, jkb + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE +*/ + +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "bam2bcf.h" +#include "read_consensus.h" +#include "cigar_state.h" + +#include +KSORT_INIT_STATIC_GENERIC(uint32_t) + +#ifndef DEBUG_ALN +#define DEBUG_ALN 0 +#endif + +#define MAX_TYPES 64 + +typedef struct +{ + int pos; // current position + char *chr; // current chromosome + int nsmpl; // number of samples + int *nplp; // per-sample number of reads + bam_pileup1_t **plp; // per-sample reads + bcf_callaux_t *bca; // auxiliary bam2bcf structure + const char *ref; // reference genome (ASCII) + uint32_t *uitmp; // temporary unsigned int array + char *inscns; // insertions consensus "ACGTN"[itype*max_ins_len+i] + int muitmp, minscns; // size of uitmp, inscns + int iref_type, ntypes, types[MAX_TYPES]; // indel types + int max_ins_len; // largest insertion + int left, right; // consensus sequence boundaries, 0-based fa ref coordinates + read_cns_t *rcns; // read consensus + cns_seq_t *cns_seq; // array of consensus sequences + int *cns_pos; // array of relative pos indexes within cns_seq sequences + uint8_t *ref_seq, *qry_seq; // reference and query sequence to align + int nref_seq, nqry_seq; // the allocated size of ref_seq and qry_seq + uint8_t *qual; + int nqual; + int *read_scores, // read scores for each indel type [ntypes*iread+itype] + mread_scores, + ref_qual[MAX_TYPES], // refseq quality at pos for each indel type in the context of homopolymer runs + sum_qual[MAX_TYPES]; // qual contributions to each indel type from all reads +} +indel_aux_t; + +#if DEBUG_ALN +static void debug_print_types(indel_aux_t *iaux) +{ + int i,j; + fprintf(bcftools_stderr,"types at %s:%d ntypes=%d... 
",iaux->chr,iaux->pos+1,iaux->ntypes); + for (i=0; intypes; i++) + { + fprintf(bcftools_stderr," type%d=",i); + if ( iaux->types[i]<=0 ) + { + if ( i==iaux->iref_type ) fprintf(bcftools_stderr,"%d(ref)",iaux->types[i]); + else fprintf(bcftools_stderr,"%d",iaux->types[i]); + continue; + } + char *cns = &iaux->inscns[i*iaux->max_ins_len]; + for (j=0; jtypes[i]; j++) fprintf(bcftools_stderr,"%c","ACGTN"[(int)cns[j]]); + } + fprintf(bcftools_stderr,"\n"); +} +#else +#define debug_print_types(iaux) +#endif + +void bcf_iaux_destroy(bcf_callaux_t *bca) +{ + if ( !bca->iaux ) return; + indel_aux_t *iaux = (indel_aux_t*)bca->iaux; + free(iaux->uitmp); + free(iaux->inscns); + free(iaux->ref_seq); + free(iaux->qry_seq); + free(iaux->qual); + free(iaux->read_scores); + rcns_destroy(iaux->rcns); + free(iaux); +} + +static void iaux_init_sequence_context(indel_aux_t *iaux) +{ + // Calculate left and right boundary. The array types is sorted in ascending order, the first + // element is the largest deletion (if a deletion present) + iaux->left = iaux->pos > iaux->bca->indel_win_size ? iaux->pos - iaux->bca->indel_win_size : 0; + iaux->right = iaux->pos + iaux->bca->indel_win_size; + if ( iaux->types[0] < 0 ) iaux->right -= iaux->types[0]; // extend by the largest deletion length + + // In case the alignments stand out the reference + int i; + for (i=iaux->pos; iright; i++) + if ( !iaux->ref[i] ) break; + iaux->right = i; + + // Sequence quality in the context of homopolymers for each indel type + int l_run = bcf_cgp_l_run(iaux->ref, iaux->pos); // The length of the homopolymer run around the current position + for (i=0; intypes; i++) + { + int l = iaux->types[i]; + + // This is the original est_seqQ() code. FIXME: check if the inserted sequence is consistent with the homopolymer run + int q = iaux->bca->openQ + iaux->bca->extQ * (abs(l) - 1); + int qh = l_run >= 3? 
(int)(iaux->bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; + if ( q > qh ) q = qh; + + iaux->ref_qual[i] = q < 255 ? q : 255; + } + + // Determine the indel region, this makes the difference between e.g. T>TA vs TA>TAA + iaux->bca->indelreg = 0; + for (i=0; intypes; i++) + { + if ( !iaux->types[i] ) continue; + int ireg; + if ( iaux->types[i] > 0 ) + ireg = est_indelreg(iaux->pos, iaux->ref, iaux->types[i], &iaux->inscns[i*iaux->max_ins_len]); + else + ireg = est_indelreg(iaux->pos, iaux->ref, -iaux->types[i], 0); + if ( ireg > iaux->bca->indelreg ) iaux->bca->indelreg = ireg; + } +} + +static int iaux_init_scores(indel_aux_t *iaux, int ismpl) +{ + int n = iaux->nplp[ismpl] * iaux->ntypes; + if ( iaux->mread_scores < n ) + { + int *tmp = (int*) realloc(iaux->read_scores,n*sizeof(int)); + if ( !tmp ) return -1; + iaux->mread_scores = n; + iaux->read_scores = tmp; + } + memset(iaux->read_scores,0,n); + return 0; +} + +static int _have_indel_reads(indel_aux_t *iaux) +{ + int i,j; + for (i=0; insmpl; i++) + { + for (j=0; jnplp[i]; j++) + if ( iaux->plp[i][j].indel ) return 1; + } + return 0; +} + +// For insertions only their sizes were collected so far. 
Now go through the reads and +// create consensus sequence for each insert, therefore note that there can be only one +// sequence per insertion length +static int iaux_init_ins_types(indel_aux_t *iaux) +{ + if ( !iaux->max_ins_len ) return 0; + + uint32_t *aux; + int naux = 5 * iaux->ntypes * iaux->max_ins_len; + if ( iaux->muitmp < naux ) + { + aux = (uint32_t*) realloc(iaux->uitmp,naux*sizeof(*aux)); + if ( !aux ) return -1; + iaux->uitmp = aux; + iaux->muitmp = naux; + } + else aux = iaux->uitmp; + memset(aux,0,naux*sizeof(*aux)); + + // count the number of occurrences of each base at each position for each type of insertion + int t,s,i,j; + for (t=0; tntypes; t++) + { + if ( iaux->types[t] <= 0) continue; + for (s=0; snsmpl; s++) + { + for (i=0; inplp[s]; i++) + { + bam_pileup1_t *plp = iaux->plp[s] + i; + if ( plp->indel != iaux->types[t] ) continue; + uint8_t *seq = bam_get_seq(plp->b); + for (j=0; jindel; j++) + { + int c = seq_nt16_int[bam_seqi(seq, plp->qpos+j+1)]; + assert(c<5); + aux[5*(t*iaux->max_ins_len+j) + c]++; + } + } + } + } + + char *cns; + int ncns = iaux->ntypes * iaux->max_ins_len; + if ( iaux->minscns < ncns ) + { + cns = (char*) realloc(iaux->inscns,naux*sizeof(*aux)); + if ( !cns ) return -1; + iaux->inscns = cns; + iaux->minscns = ncns; + } + else cns = iaux->inscns; + memset(aux,0,ncns*sizeof(*cns)); + + // use the majority rule to construct the consensus + for (t=0; tntypes; t++) + { + for (i=0; itypes[t]; i++) // this naturally includes only insertions + { + uint32_t *tmp = &aux[5*(t*iaux->max_ins_len+i)], max = tmp[0], max_j = 0; + for (j=1; j<5; j++) + if ( max < tmp[j] ) max = tmp[j], max_j = j; + cns[t*iaux->max_ins_len + i] = max ? 
max_j : 4; + if ( max_j==4 ) { iaux->types[t] = 0; break; } // discard insertions which contain N's + } + } + return 0; +} + +#define MINUS_CONST 0x10000000 +static int iaux_init_types(indel_aux_t *iaux) +{ + if ( !_have_indel_reads(iaux) ) return 0; + + iaux->bca->max_support = 0; + memset(iaux->sum_qual,0,MAX_TYPES*sizeof(*iaux->sum_qual)); + + int i,j, nreads = 0; + for (i=0; insmpl; i++) nreads += iaux->nplp[i]; + + uint32_t *aux; + if ( iaux->muitmp < nreads+1 ) + { + aux = (uint32_t*) realloc(iaux->uitmp,(nreads+1)*sizeof(*iaux->uitmp)); + if ( !aux ) return -1; + iaux->uitmp = aux; + iaux->muitmp = nreads+1; + } + else aux = iaux->uitmp; + memset(aux,0,(nreads+1)*sizeof(*aux)); + + int naux = 0, indel_support_ok = 0, n_alt = 0, n_tot = 0; + int max_rd_len = 0; // max sequence length that includes ref+del bases + + // Fill out aux[] array with all the non-zero indel sizes. This is an unsorted list with as many + // entries as there are reads + aux[naux++] = MINUS_CONST; // zero indel is always a type (REF) + for (i=0; insmpl; i++) + { + int nalt = naux, ntot = 0; // per sample values + for (j=0; jnplp[i]; j++) + { + const bam_pileup1_t *plp = iaux->plp[i] + j; + ntot++; + if ( plp->indel ) aux[naux++] = MINUS_CONST + plp->indel; + if ( !PLP_QLEN(&plp->cd) ) PLP_QLEN(&plp->cd) = bam_cigar2qlen(plp->b->core.n_cigar, bam_get_cigar(plp->b)); + if ( PLP_QLEN(&plp->cd) > max_rd_len ) max_rd_len = PLP_QLEN(&plp->cd); + } + nalt = naux - nalt; + if ( iaux->bca->per_sample_flt ) + { + double frac = (double)nalt/naux; + if ( nalt >= iaux->bca->min_support && frac >= iaux->bca->min_frac ) indel_support_ok = 1; + if ( nalt > iaux->bca->max_support && frac > 0 ) iaux->bca->max_support = nalt, iaux->bca->max_frac = frac; + } + else + { + n_alt += nalt; + n_tot += ntot; + } + } + + // Check if the minimum required number of indel reads has been observed + if ( !iaux->bca->per_sample_flt && n_alt >= iaux->bca->min_support && (double)n_alt/n_tot >= iaux->bca->min_frac ) 
indel_support_ok = 1; + if ( naux==1 || !indel_support_ok ) return 0; + + // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), check the number of N's in the + // sequence and skip places where half or more reference bases in the sequence that follows pos are Ns + int nN = 0, i_end = iaux->pos + (iaux->bca->indel_win_size < max_rd_len ? iaux->bca->indel_win_size : max_rd_len); + for (i=iaux->pos; iref[i]; i++) + if ( iaux->ref[i] == 'N' ) nN++; + if ( 2*nN > i - iaux->pos ) return -1; + + // Sort aux[] and dedup indel types + int n_types = 1; + ks_introsort(uint32_t, naux, aux); + for (i=1; i= MAX_TYPES ) + { + static int warned = 0; + if ( !warned ) + { + fprintf(bcftools_stderr, "Warning: excessive number of INDEL alleles at %s:%d, skipping. (This warning is printed only once)\n",iaux->chr,iaux->pos+1); + warned = 1; + } + return -1; + } + + // Fill out the types[] array detailing the size of insertion or deletion. + iaux->ntypes = 0; + iaux->max_ins_len = 0; + for (i=0; iiref_type = iaux->ntypes; + } + else + { + if ( j-i >= iaux->bca->min_support ) is_ok = 1; + // What is the best way to handle the -pmF options: + // - consider only sites where a single indel type passes the -mF threshold, as opposed to all indel types cumulatively + // - once a site passes, include all indel types in the evaluation, as opposed to considering only the strong candidates + // In this implementation sites are selected by counting reads from all indel types cumulatively and all indel types + // are considered. 
+ // Uncomment the following condition to consider only strong indel candidates once the site has been selected + // if ( !iaux->bca->per_sample_flt && (double)(j-i) / n_tot < iaux->bca->min_frac ) is_ok = 0; + } + if ( is_ok ) + { + iaux->types[iaux->ntypes++] = isize; + if ( isize > 0 && isize > iaux->max_ins_len ) iaux->max_ins_len = isize; + } + i = j-1; + } + if ( iaux->ntypes <= 1 ) return 0; + + // Init insertion types, including their sequence + if ( iaux_init_ins_types(iaux) < 0 ) return -1; + + iaux_init_sequence_context(iaux); + + return iaux->ntypes; +} +#undef MINUS_CONST + +static int iaux_set_consensus(indel_aux_t *iaux, int ismpl) +{ + if ( !iaux->rcns ) + iaux->rcns = rcns_init(iaux->pos, iaux->left, iaux->right); + else + rcns_reset(iaux->rcns, iaux->pos, iaux->left, iaux->right); + + rcns_set_reads(iaux->rcns, iaux->plp[ismpl], iaux->nplp[ismpl]); + + iaux->cns_seq = rcns_get_consensus(iaux->rcns, iaux->ref + iaux->left); + +// todo: +// rcns should also collect localized number of mismatches as a substitute +// for uninformative MQ. This would not affect calling but would help with +// filtering + + return 0; +} + +#if 0 +// Finds the smallest index in the seq_pos array holding value equal to pos, or if there is no +// such value, the largest index with value smaller than pos. Starts at initial guess ioff. +// This could use a binary search but the assumption is that the initial guess is indel-size close +// to the actuall coordinate. 
+// +// TODO: remove this function and seq_pos from cns creation as it seems unnecessary +static int find_ref_offset(hts_pos_t pos, hts_pos_t *seq_pos, int nseq_pos, int ioff) +{ + if ( ioff<0 ) ioff = 0; + else if ( ioff >= nseq_pos ) ioff = nseq_pos - 1; + if ( seq_pos[ioff] < pos ) + { + while ( ioff+1 < nseq_pos && seq_pos[ioff] < pos ) ioff++; + if ( seq_pos[ioff] > pos ) ioff--; + return ioff; + } + while ( ioff > 0 && seq_pos[ioff-1] >= pos ) ioff--; + return ioff; +} +#endif + +static int iaux_align_read(indel_aux_t *iaux, bam1_t *bam, uint8_t *ref_seq, int nref_seq) +{ + if ( bam->core.flag & BAM_FUNMAP ) return 1; // skip unmapped reads + + // Trim both ref and qry to the window of interest + hts_pos_t ref_beg = iaux->left; // fa ref coordinates + hts_pos_t ref_end = iaux->right < ref_beg + nref_seq ? iaux->right : ref_beg + nref_seq - 1; + + cigar_state_t cigar; + cstate_init(&cigar,bam); + int qry_off1, qry_off2, ref_off1, ref_off2; + if ( ref_beg > bam->core.pos ) + { + // the read needs trimming from left + qry_off1 = cstate_seek_fwd(&cigar, &ref_beg, 1); + ref_off1 = ref_beg - iaux->left; + + if ( ref_beg + (bam->core.l_qseq - qry_off1) > ref_end ) + { + // the read needs trimming from right + qry_off2 = ref_end - ref_beg + qry_off1; + ref_off2 = ref_end - iaux->left; + } + else + { + // the ref template needs trimming from right + qry_off2 = bam->core.l_qseq - 1; + ref_off2 = ref_off1 + qry_off2 - qry_off1; + } + } + else + { + // the ref template needs trimming from left + qry_off1 = 0; + ref_off1 = bam->core.pos - ref_beg; + + if ( bam->core.pos + bam->core.l_qseq - 1 > ref_end ) + { + // the read needs trimming from right + ref_off2 = ref_end - iaux->left; + qry_off2 = ref_off2 - ref_off1; + } + else + { + // the ref template needs trimming from right + qry_off2 = bam->core.l_qseq - 1; + ref_off2 = ref_off1 + qry_off2 - qry_off1; + } + } +//fprintf(bcftools_stderr,"xtrim: %s .. 
left,right=%d,%d rbeg,end=%d,%d qpos=%d qlen=%d qoff=%d,%d roff=%d,%d rlen=%d\n",bam_get_qname(bam),iaux->left,iaux->right,(int)ref_beg,(int)ref_end,(int)bam->core.pos,bam->core.l_qseq, qry_off1,qry_off2,ref_off1,ref_off2,nref_seq); + + assert( qry_off1<=qry_off2 ); + assert( qry_off1>=0 && qry_off1core.l_qseq ); + assert( qry_off2>=0 && qry_off2core.l_qseq ); + + assert( ref_off1<=ref_off2 ); + assert( ref_off1>=0 && ref_off1=0 && ref_off2nqry_seq < qlen ) + { + uint8_t *tmp = (uint8_t*) realloc(iaux->qry_seq, qlen); + if ( !tmp ) return -1; // critical error + iaux->qry_seq = tmp; + iaux->nqry_seq = qlen; + } + uint8_t *seq = bam_get_seq(bam); + for (i=qry_off1; i<=qry_off2; i++) iaux->qry_seq[i-qry_off1] = seq_nt16_int[bam_seqi(seq,i)]; + + // prepare qualities, either BQ or BAQ qualities (ZQ) + if ( iaux->nqual < qlen ) + { + uint8_t *tmp = (uint8_t*) realloc(iaux->qual, qlen); + if ( !tmp ) return -1; // critical error + iaux->qual = tmp; + iaux->nqual = qlen; + } + uint8_t *qual = iaux->qual; + const uint8_t *qq = bam_get_qual(bam); + const uint8_t *bq = (uint8_t*)bam_aux_get(bam, "ZQ"); + if ( bq ) bq++; // skip type + for (i=qry_off1; i<=qry_off2; i++) + { + int j = i - qry_off1; + qual[j] = bq ? qq[i] + (bq[i] - 64) : qq[i]; + if ( qual[j] > 30 ) qual[j] = 30; + if ( qual[j] < 7 ) qual[j] = 7; + } + +// Illumina +probaln_par_t apf = { 1e-4, 1e-2, 10 }; + + // align + int score = probaln_glocal(ref_seq + ref_off1, rlen, iaux->qry_seq, qlen, qual, &apf, 0, 0); + int adj_score = (int)(100. * score / qlen + .499) * iaux->bca->indel_bias; + +#if DEBUG_ALN + fprintf(bcftools_stderr,"aln: %d/%d\t%s\n\tref: ",score,adj_score,bam_get_qname(bam)); + for (i=0; iqry_seq[i]]); + fprintf(bcftools_stderr,"\n\tqual: "); + for (i=0; i 255 ) adj_score = 255; + return score<<8 | adj_score; +} + +// Score all reads for this sample and indel type using the up to two consensus sequence templates. 
+// On output sets iaux->read_scores[iread*ntypes+itype] = (raw_score<<8 | length_adjusted_score) +static int iaux_score_reads(indel_aux_t *iaux, int ismpl, int itype) +{ + int i; + cns_seq_t *cns = iaux->cns_seq; + while ( cns->nseq ) + { + // Resize buffers if necessary + int ref_len = cns->nseq + iaux->types[itype]; + if ( iaux->nref_seq < ref_len ) + { + uint8_t *ref_buf = (uint8_t*) realloc(iaux->ref_seq,sizeof(uint8_t)*ref_len); + if ( !ref_buf ) return -1; + iaux->ref_seq = ref_buf; + iaux->nref_seq = ref_len; + } + + // Apply the indel and create the template ref sequence... + memcpy(iaux->ref_seq,cns->seq,(cns->ipos+1)*sizeof(*iaux->ref_seq)); + if ( iaux->types[itype] < 0 ) // deletion + memcpy(iaux->ref_seq + cns->ipos + 1, cns->seq + cns->ipos + 1 - iaux->types[itype], (cns->nseq - cns->ipos - 1 + iaux->types[itype])*sizeof(*iaux->ref_seq)); + else + { + char *ins = &iaux->inscns[itype*iaux->max_ins_len]; + for (i=0; itypes[itype]; i++) iaux->ref_seq[cns->ipos+1+i] = ins[i]; + memcpy(iaux->ref_seq + cns->ipos + 1 + iaux->types[itype], cns->seq + 1 + cns->ipos, (cns->nseq - cns->ipos - 1)*sizeof(*iaux->ref_seq)); + } + +#if DEBUG_ALN + fprintf(bcftools_stderr,"template %d, type %d, sample %d: ",cns==iaux->cns_seq?0:1,itype,ismpl); + for (i=0; iref_seq[i]]); + fprintf(bcftools_stderr,"\n"); +#endif + + // Align and score reads + for (i=0; inplp[ismpl]; i++) + { + const bam_pileup1_t *plp = iaux->plp[ismpl] + i; + int aln_score = iaux_align_read(iaux, plp->b, iaux->ref_seq, ref_len); + int *score = &iaux->read_scores[i*iaux->ntypes+itype]; + if ( cns==iaux->cns_seq || *score > aln_score ) *score = aln_score; + } + cns++; + } + return 0; +} + +// Determines indel quality for each read and populates 22 bits of pileup aux field with +// three integers as follows +// plp->aux = indel_type << 16 | seqQ << 8 | indelQ +static int iaux_eval_scored_reads(indel_aux_t *iaux, int ismpl) +{ + int i,j; + for (i=0; inplp[ismpl]; i++) + { + bam_pileup1_t *plp = 
iaux->plp[ismpl] + i; + + // Find the best indel type and the ref type, their scores difference is the indel quality + int *score = &iaux->read_scores[i*iaux->ntypes]; + int alt_score = INT_MAX, alt_j = 0; + for (j=0; jiref_type; j++) + if ( alt_score > score[j] ) alt_score = score[j], alt_j = j; + for (j=iaux->iref_type+1; jntypes; j++) + if ( alt_score > score[j] ) alt_score = score[j], alt_j = j; + int ref_score = score[iaux->iref_type]; + int sc0, sc1, j0; + if ( alt_score < ref_score ) sc0 = alt_score, sc1 = ref_score, j0 = alt_j; + else sc0 = ref_score, sc1 = alt_score, j0 = iaux->iref_type; + + int indelQ = (sc1>>8) - (sc0>>8); // low=bad, high=good + int seqQ = iaux->ref_qual[alt_j]; + + // Reduce indelQ. High length-normalized alignment scores (i.e. bad alignments) + // lower the quality more (e.g. gnuplot> plot [0:111] (1-x/111.)*255) + int len_normQ = sc0 & 0xff; // length-normalized score of the best match (ref or alt) + int adj_indelQ; // final indelQ used in calling + if ( len_normQ > 111 ) + { + // In the original code reads matching badly to any indel type or reference had indelQ set to 0 + // here and thus would be effectively removed from calling. This leads to problems when there are + // many soft clipped reads and a few good matching indel reads (see noisy-softclips.bam in + // mpileup-tests). Only the few good quality indel reads would become visible to the caller and + // the indel would be called with high quality. Here we change the logic to make the badly matching + // reads low quality reference reads. The threshold was set to make the test case still be called + // as an indel, but with very low quality. + // + // Original code: + // adj_indelQ = 0; + // + adj_indelQ = 12; + j0 = iaux->iref_type; + } + else + adj_indelQ = (int)((1. - len_normQ/111.) 
* indelQ + .499); + +#if DEBUG_ALN + // Prints the selected indel type (itype); adjusted indelQ which will be used if bigger than seqQ; + // raw indelQ; length-normalized indelQ and sequence context quality; ref and best alt indel type + // and their raw and length-normalized scores + fprintf(bcftools_stderr,"itype=%d adj_indelQ=%d\trawQ=%d\tlen_normQ=%d\tseqQ=%d\tref:%d=%d/%d alt:%d=%d/%d)\t%s\n", + j0,adj_indelQ,indelQ,len_normQ,seqQ,iaux->iref_type,ref_score>>8,ref_score&0xff,alt_j,alt_score>>8,alt_score&0xff,bam_get_qname(plp->b)); +#endif + + if ( adj_indelQ > seqQ ) adj_indelQ = seqQ; // seqQ already capped at 255 + plp->aux = j0<<16 | seqQ<<8 | adj_indelQ; // use 22 bits in total + iaux->sum_qual[j0] += adj_indelQ; + } + return 0; +} + +// Find the best indel types, include the ref type plus maximum three alternate indel alleles. +static int iaux_eval_best_indels(indel_aux_t *iaux) +{ + bcf_callaux_t *bca = iaux->bca; + bca->maxins = iaux->max_ins_len; + bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4); + if ( bca->maxins && !bca->inscns ) return -1; + + // insertion sort, descending, high-quality indels come first + int i,j,t, tmp, *sumq = iaux->sum_qual, ntypes = iaux->ntypes; + for (t=0; t0 && sumq[j] > sumq[j-1]; j--) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t=0; tiref_type ) break; + if ( t ) + { + // move the reference type to the first + tmp = sumq[t]; + for (; t>0; t--) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + + // Initialize bca's structures and create a mapping between old and new types + int old2new_type[MAX_TYPES]; + for (t=0; tntypes; t++) + { + int itype = sumq[t] & 0x3f; + old2new_type[itype] = t; + if ( t>=4 ) continue; + bca->indel_types[t] = iaux->types[itype]; + if ( bca->indel_types[t] <= 0 ) continue; + memcpy(&bca->inscns[t*bca->maxins], &iaux->inscns[itype*iaux->max_ins_len], bca->maxins); + } + + // Update indel type in plp->aux for all reads + int ismpl, n_alt = 0; + for (ismpl=0; ismplnsmpl; 
ismpl++) + { + for (i=0; inplp[ismpl]; i++) + { + bam_pileup1_t *plp = iaux->plp[ismpl] + i; + int itype_old = (plp->aux >> 16) & 0x3f; + int itype_new = old2new_type[itype_old]; + plp->aux = itype_new<<16 | (itype_new>=4 ? 0 : (plp->aux & 0xffff)); + if ( itype_new>0 ) n_alt++; + } + } + return n_alt; +} + +/* + notes: + - n .. number of samples + - the routine sets bam_pileup1_t.aux (27 bits) of each read as follows: + - 5: unused + - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f + - 8: estimated sequence quality .. (aux>>8)&0xff + - 8: indel quality .. aux&0xff + */ +int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref) +{ +assert(!(ref == 0 || bca == 0)); // can this ever happen? when? + if (ref == 0 || bca == 0) return -1; + + if ( !bca->iaux ) bca->iaux = calloc(1,sizeof(indel_aux_t)); + indel_aux_t *iaux = bca->iaux; + iaux->nsmpl = n; + iaux->nplp = n_plp; + iaux->plp = plp; + iaux->bca = bca; + iaux->ref = ref; + iaux->pos = pos; + iaux->chr = bca->chr; + + // Check if there is an indel at this position and if yes, find all indel types and determine + // window boundaries. todo: We want this information cached so that for long reads we don't keep + // redoing the whole analysis again and again + int ntypes = iaux_init_types(iaux); + if ( ntypes<=0 ) return -1; + + debug_print_types(iaux); + + // Create two template consensus sequences for each sample (assuming max diploid organism). + // Then apply each indel type on top of the templates, realign every read and remember score + int i,j; + for (i=0; insmpl; i++) + { + iaux_set_consensus(iaux, i); + iaux_init_scores(iaux, i); + for (j=0; j 0 ? 0 : -1; +} + diff --git a/bcftools/bam2bcf_indel.c b/bcftools/bam2bcf_indel.c index 108d505..faedc3f 100644 --- a/bcftools/bam2bcf_indel.c +++ b/bcftools/bam2bcf_indel.c @@ -84,7 +84,7 @@ static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) return q < qh? 
q : qh; } -static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) +inline int est_indelreg(int pos, const char *ref, int l, char *ins4) { int i, j, max = 0, max_i = pos, score = 0; l = abs(l); @@ -408,7 +408,7 @@ static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, } // The length of the homopolymer run around the current position -static int bcf_cgp_l_run(const char *ref, int pos) { +int bcf_cgp_l_run(const char *ref, int pos) { int i, l_run; int c = seq_nt16_table[(int)ref[pos + 1]]; @@ -922,7 +922,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s " "qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), - qbeg, tbeg, sc); + qbeg, tbeg, score[K*n_types + t]); #endif } } diff --git a/bcftools/bam2bcf_indel.c.pysam.c b/bcftools/bam2bcf_indel.c.pysam.c index c2287de..65a7179 100644 --- a/bcftools/bam2bcf_indel.c.pysam.c +++ b/bcftools/bam2bcf_indel.c.pysam.c @@ -86,7 +86,7 @@ static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) return q < qh? 
q : qh; } -static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) +inline int est_indelreg(int pos, const char *ref, int l, char *ins4) { int i, j, max = 0, max_i = pos, score = 0; l = abs(l); @@ -410,7 +410,7 @@ static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, } // The length of the homopolymer run around the current position -static int bcf_cgp_l_run(const char *ref, int pos) { +int bcf_cgp_l_run(const char *ref, int pos) { int i, l_run; int c = seq_nt16_table[(int)ref[pos + 1]]; @@ -924,7 +924,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s " "qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam_get_qname(p->b), - qbeg, tbeg, sc); + qbeg, tbeg, score[K*n_types + t]); #endif } } diff --git a/bcftools/bcftools.h b/bcftools/bcftools.h index a915802..c3f7ded 100644 --- a/bcftools/bcftools.h +++ b/bcftools/bcftools.h @@ -57,8 +57,6 @@ char *init_tmp_prefix(const char *prefix); int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq); int parse_overlap_option(const char *arg); -void *smalloc(size_t size); // safe malloc - static inline int iupac2bitmask(char iupac) { const int A = 1; @@ -99,7 +97,7 @@ static inline int iupac_consistent(char iupac, char nt) 13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10 }; if ( iupac > 89 ) return 0; - if ( nt > 90 ) nt -= 32; // lowercase + if ( nt > 90 ) nt -= 32; // lowercase if ( nt=='A' ) nt = 1; else if ( nt=='C' ) nt = 2; else if ( nt=='G' ) nt = 4; diff --git a/bcftools/cigar_state.h b/bcftools/cigar_state.h new file mode 100644 index 0000000..a12a709 --- /dev/null +++ b/bcftools/cigar_state.h @@ -0,0 +1,182 @@ +/* cigar_state.h -- API for efficient parsing of CIGAR strings + + Copyright (C) 2022 Genome Research Ltd. 
+ + Author: pd3@sanger + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. */ + +#ifndef CIGAR_STATE_H +#define CIGAR_STATE_H + +#include +#include +#include + +typedef struct +{ + bam1_t *bam; + uint32_t *cigar; + uint8_t *seq; + int ncig; + int icig; // position in the cigar string + int iseq; // the cigar[icigar] operation refers to &seq[iseq] + hts_pos_t ref_pos; // reference coordinate, corresponds to iseq; points to + // the first base after the read when consumed +} +cigar_state_t; + +static inline void cstate_init(cigar_state_t *cs, bam1_t *bam) +{ + cs->bam = bam; + cs->cigar = bam_get_cigar(bam); + cs->seq = bam_get_seq(bam); + cs->ncig = bam->core.n_cigar; + cs->icig = 0; + cs->iseq = 0; + cs->ref_pos = bam->core.pos; +} + +/** + * cstate_seek_fwd() - Move in the cigar forward to find query index that + * matches the reference position. 
+ * + * When the position is not contained within the sequence, either because there + * is a deletion or there is no overlap, the behavior is controlled by the value + * of trim_left: + * - read starts after: qry_beg > pos && trim_left=1 .. returns 0 and sets pos to qry_beg + * - read starts after: qry_beg > pos && trim_left=0 .. returns -1 + * - read ends before: qry_end < pos && trim_left=1 .. returns -2 + * - read ends before: qry_end < pos && trim_left=0 .. returns qry_len-1 and sets pos to qry_end + * - pos inside a deletion && trim_left=1 .. returns position after the deletion + * - pos inside a deletion && trim_left=0 .. returns position before the deletion + */ +static inline int cstate_seek_fwd(cigar_state_t *cs, hts_pos_t *pos_ptr, int trim_left) +{ + hts_pos_t pos = *pos_ptr; + while ( cs->ref_pos <= pos ) + { + if ( cs->icig >= cs->ncig ) // the read ends before pos + { + if ( trim_left ) return -2; + *pos_ptr = cs->ref_pos - 1; + return cs->iseq - 1; + } + + int op = cs->cigar[cs->icig] & BAM_CIGAR_MASK; + int len = cs->cigar[cs->icig] >> BAM_CIGAR_SHIFT; + if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF ) + { + if ( cs->ref_pos + len > pos ) return pos - cs->ref_pos + cs->iseq; // the cigar op overlaps pos + cs->ref_pos += len; + cs->iseq += len; + cs->icig++; + continue; + } + if ( op==BAM_CINS || op==BAM_CSOFT_CLIP ) + { + cs->iseq += len; + cs->icig++; + continue; + } + if ( op==BAM_CDEL || op==BAM_CREF_SKIP ) + { + if ( cs->ref_pos + len > pos ) + { + // The deletion overlaps the position. NB: assuming del is never the first or last op + *pos_ptr = trim_left ? cs->ref_pos + len : cs->ref_pos - 1; + return trim_left ? 
cs->iseq : cs->iseq - 1; + } + cs->ref_pos += len; + cs->icig++; + continue; + } + } + // the read starts after pos + if ( trim_left ) + { + *pos_ptr = cs->bam->core.pos; + return 0; + } + return -1; +} + + +/** + * cstate_seek_op_fwd() - Move in the cigar forward to find query index that + * matches the seek operator and the reference position. + * + * In order to match a deletion, pass the position of the first deleted base. + * In order to match an insertion, pass the reference coordinate of the base + * after the inserted sequence. + * + * Returns the index to the query sequence cs->seq + * on success; -1 when there is no such matching position but the cigar + * is still not entirely consumed (e.g. a deletion or a soft-clip); -2 + * when there is no overlap (i.e. the read ends before the position). + */ +static inline int cstate_seek_op_fwd(cigar_state_t *cs, hts_pos_t pos, int seek_op, int *oplen) +{ + while ( cs->ref_pos <= pos ) + { + if ( cs->icig >= cs->ncig ) return -2; + + int op = cs->cigar[cs->icig] & BAM_CIGAR_MASK; + int len = cs->cigar[cs->icig] >> BAM_CIGAR_SHIFT; + if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF ) + { + if ( cs->ref_pos + len <= pos ) + { + cs->ref_pos += len; + cs->iseq += len; + cs->icig++; + continue; + } + if ( seek_op==BAM_CMATCH ) return pos - cs->ref_pos + cs->iseq; + return -1; + } + if ( op==BAM_CINS || op==BAM_CSOFT_CLIP ) + { + if ( cs->ref_pos == pos && seek_op==op ) + { + if ( oplen ) *oplen = len; + return cs->iseq; + } + if ( cs->ref_pos >= pos ) return -1; + cs->iseq += len; + cs->icig++; + continue; + } + if ( op==BAM_CDEL || op==BAM_CREF_SKIP ) + { + if ( cs->ref_pos == pos && seek_op==op ) + { + if ( oplen ) *oplen = len; + return cs->iseq; + } + if ( cs->ref_pos >= pos ) return -1; + cs->ref_pos += len; + cs->icig++; + continue; + } + } + return cs->icig < cs->ncig ? 
-1 : -2; +} + +#endif diff --git a/bcftools/consensus.c b/bcftools/consensus.c index 84ae905..397d45f 100644 --- a/bcftools/consensus.c +++ b/bcftools/consensus.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2014-2022 Genome Research Ltd. + Copyright (c) 2014-2023 Genome Research Ltd. Author: Petr Danecek @@ -42,6 +42,7 @@ #include "bcftools.h" #include "rbuf.h" #include "filter.h" +#include "smpl_ilist.h" // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 @@ -115,11 +116,12 @@ typedef struct FILE *fp_out; FILE *fp_chain; char **argv; - int argc, output_iupac, haplotype, allele, isample, napplied; - uint8_t *iupac_bitmask; - int miupac_bitmask; - char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; + int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied; + uint8_t *iupac_bitmask, *iupac_als; + int miupac_bitmask, miupac_als; + char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; char mark_del, mark_ins, mark_snv; + smpl_ilist_t *smpl; } args_t; @@ -226,15 +228,27 @@ static void init_data(args_t *args) if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); args->hdr = args->files->readers[0].header; args->isample = -1; - if ( args->sample ) + if ( !args->sample ) + args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE); + else if ( args->sample && strcmp("-",args->sample) ) { - args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample); - if ( args->isample<0 ) error("No such sample: %s\n", args->sample); + args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE); + if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); } - if ( (args->haplotype || args->allele) && args->isample<0 ) + 
else if ( args->sample_fname ) { - if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n"); - args->isample = 0; + args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE); + if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); + } + if ( args->smpl ) + { + if ( args->haplotype || args->allele ) + { + if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H\n"); + args->isample = args->smpl->idx[0]; + } + else + args->iupac_GTs = 1; } int i; for (i=0; inmask; i++) @@ -258,7 +272,7 @@ static void init_data(args_t *args) if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno)); } else args->fp_out = stdout; - if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n"); + if ( args->isample<0 && !args->iupac_GTs ) fprintf(stderr,"Note: the --samples option not given, applying all records regardless of the genotype\n"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); args->rid = -1; @@ -282,8 +296,10 @@ static void add_mask_with(args_t *args, char *with) } static void destroy_data(args_t *args) { + free(args->iupac_als); free(args->iupac_bitmask); if (args->filter) filter_destroy(args->filter); + if ( args->smpl ) smpl_ilist_destroy(args->smpl); bcf_sr_destroy(args->files); int i; for (i=0; ivcf_rbuf.m; i++) @@ -470,6 +486,59 @@ static void mark_snv(char *ref, char *alt, char mark) if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]); } } +static void iupac_init(args_t *args, bcf1_t *rec) +{ + int i; + hts_resize(uint8_t, rec->n_allele, &args->miupac_als, &args->iupac_als, 0); + for (i=0; imiupac_als; i++) args->iupac_als[i] = 0; +} +static int iupac_add_gt(args_t *args, bcf1_t *rec, uint8_t *gt, int ngt) +{ + int i, is_set = 0; + for (i=0; i= rec->n_allele ) error("Invalid VCF, too few ALT alleles at 
%s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + args->iupac_als[ial] = 1; + is_set = 1; + } + return is_set; +} +static int iupac_set_allele(args_t *args, bcf1_t *rec) +{ + int i,j, max_len = 0, alt_len = 0, ialt = -1, fallback_alt = -1; + for (i=0; in_allele; i++) + { + if ( !args->iupac_als[i] ) continue; + if ( fallback_alt <=0 ) fallback_alt = i; + int l = strlen(rec->d.allele[i]); + for (j=0; jd.allele[i][j]) < 0 ) break; + if ( jmax_len ) + { + hts_resize(uint8_t, l, &args->miupac_bitmask, &args->iupac_bitmask, HTS_RESIZE_CLEAR); + for (j=max_len; jiupac_bitmask[j] = 0; + max_len = l; + } + if ( i>0 && l>alt_len ) + { + alt_len = l; + ialt = i; + } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]); + } + if ( alt_len > 0 ) + { + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + return ialt; + } + if ( fallback_alt >= 0 ) return fallback_alt; + return ialt; +} static void apply_variant(args_t *args, bcf1_t *rec) { static int warned_haplotype = 0; @@ -491,7 +560,25 @@ static void apply_variant(args_t *args, bcf1_t *rec) } int ialt = 1; // the alternate allele - if ( args->isample >= 0 ) + if ( args->iupac_GTs ) + { + bcf_unpack(rec, BCF_UN_FMT); + bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); + if ( !fmt ) return; + if ( fmt->type!=BCF_BT_INT8 ) + error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + ialt = -1; + int is_set = 0; + iupac_init(args,rec); + for (i=0; ismpl->n; i++) + { + uint8_t *ptr = fmt->p + fmt->size*args->smpl->idx[i]; + is_set += iupac_add_gt(args, rec, ptr, fmt->n); + } + if ( !is_set && !args->missing_allele ) return; + if ( is_set ) ialt = iupac_set_allele(args, rec); + } + else if ( args->isample >= 0 ) { bcf_unpack(rec, BCF_UN_FMT); bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); @@ -544,39 +631,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) else if ( action==use_iupac 
) { ialt = -1; - int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1; - for (i=0; in; i++) - { - if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; } - if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; - int jalt = bcf_gt_allele(ptr[i]); - if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( fallback_alt <= 0 ) fallback_alt = jalt; - - int l = strlen(rec->d.allele[jalt]); - for (j=0; jd.allele[jalt][j]) < 0 ) break; - if ( j mlen ) - { - hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); - for (j=mlen; jiupac_bitmask[j] = 0; - mlen = l; - } - if ( jalt>0 && l>alen ) - { - alen = l; - ialt = jalt; - } - for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]); - } - if ( alen > 0 ) - for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); - else if ( fallback_alt >= 0 ) - ialt = fallback_alt; - else if ( is_missing && !args->missing_allele ) return; + iupac_init(args,rec); + int is_set = iupac_add_gt(args, rec, ptr, fmt->n); + if ( !is_set && !args->missing_allele ) return; + if ( is_set ) ialt = iupac_set_allele(args, rec); } else { @@ -1030,16 +1088,16 @@ static void usage(args_t *args) fprintf(stderr, "\n"); fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n"); fprintf(stderr, " file. By default, the program will apply all ALT variants. Using the\n"); - fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); + fprintf(stderr, " --samples (and, optionally, --haplotype) option will apply genotype\n"); fprintf(stderr, " (or haplotype) calls from FORMAT/GT. 
The program ignores allelic depth\n"); fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n"); fprintf(stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(stderr, "Options:\n"); - fprintf(stderr, " -c, --chain FILE write a chain file for liftover\n"); - fprintf(stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n"); - fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n"); - fprintf(stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(stderr, " -c, --chain FILE Write a chain file for liftover\n"); + fprintf(stderr, " -a, --absent CHAR Replace positions absent from VCF with CHAR\n"); + fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); + fprintf(stderr, " -H, --haplotype WHICH Choose which allele to use from the FORMAT/GT field, note\n"); fprintf(stderr, " the codes are case-insensitive:\n"); fprintf(stderr, " 1: first allele from GT, regardless of phasing\n"); fprintf(stderr, " 2: second allele from GT, regardless of phasing\n"); @@ -1049,17 +1107,18 @@ static void usage(args_t *args) fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); - fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n"); - fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); - fprintf(stderr, " --mark-ins uc|lc highlight insertions in uppercase 
(uc) or lowercase (lc), leaving the rest as is\n"); - fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); - fprintf(stderr, " -m, --mask FILE replace regions according to the next --mask-with option. The default is --mask-with N\n"); - fprintf(stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n"); - fprintf(stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n"); - fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n"); - fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); - fprintf(stderr, " -s, --sample NAME apply variants of the given sample\n"); + fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(stderr, " -I, --iupac-codes Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n"); + fprintf(stderr, " --mark-del CHAR Instead of removing sequence, insert CHAR for deletions\n"); + fprintf(stderr, " --mark-ins uc|lc Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " --mark-snv uc|lc Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(stderr, " -m, --mask FILE Replace regions according to the next --mask-with option. 
The default is --mask-with N\n"); + fprintf(stderr, " --mask-with CHAR|uc|lc Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n"); + fprintf(stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n"); + fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(stderr, " -p, --prefix STRING Prefix to add to output sequence names\n"); + fprintf(stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n"); + fprintf(stderr, " -S, --samples-file FILE File of samples to include\n"); fprintf(stderr, "Examples:\n"); fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(stderr, " # in the form \">chr:from-to\".\n"); @@ -1084,6 +1143,8 @@ int main_consensus(int argc, char *argv[]) {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, {"iupac-codes",0,0,'I'}, {"haplotype",1,0,'H'}, {"output",1,0,'o'}, @@ -1096,7 +1157,7 @@ int main_consensus(int argc, char *argv[]) {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) { switch (c) { @@ -1113,6 +1174,7 @@ int main_consensus(int argc, char *argv[]) break; case 'p': args->chr_prefix = optarg; break; case 's': args->sample = optarg; break; + case 'S': args->sample_fname = optarg; break; case 'o': args->output_fname = optarg; break; case 'I': args->output_iupac = 1; break; case 'e': diff --git a/bcftools/consensus.c.pysam.c b/bcftools/consensus.c.pysam.c index 4af9c18..b611925 100644 --- a/bcftools/consensus.c.pysam.c +++ b/bcftools/consensus.c.pysam.c @@ -2,7 +2,7 @@ /* The MIT License - Copyright (c) 2014-2022 Genome Research Ltd. 
+ Copyright (c) 2014-2023 Genome Research Ltd. Author: Petr Danecek @@ -44,6 +44,7 @@ #include "bcftools.h" #include "rbuf.h" #include "filter.h" +#include "smpl_ilist.h" // Logic of the filters: include or exclude sites which match the filters? #define FLT_INCLUDE 1 @@ -117,11 +118,12 @@ typedef struct FILE *fp_out; FILE *fp_chain; char **argv; - int argc, output_iupac, haplotype, allele, isample, napplied; - uint8_t *iupac_bitmask; - int miupac_bitmask; - char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; + int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied; + uint8_t *iupac_bitmask, *iupac_als; + int miupac_bitmask, miupac_als; + char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele; char mark_del, mark_ins, mark_snv; + smpl_ilist_t *smpl; } args_t; @@ -228,15 +230,27 @@ static void init_data(args_t *args) if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum)); args->hdr = args->files->readers[0].header; args->isample = -1; - if ( args->sample ) + if ( !args->sample ) + args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE); + else if ( args->sample && strcmp("-",args->sample) ) { - args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample); - if ( args->isample<0 ) error("No such sample: %s\n", args->sample); + args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE); + if ( args->smpl && !args->smpl->n ) error("No matching sample found\n"); } - if ( (args->haplotype || args->allele) && args->isample<0 ) + else if ( args->sample_fname ) { - if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n"); - args->isample = 0; + args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE); + if 
( args->smpl && !args->smpl->n ) error("No matching sample found\n"); + } + if ( args->smpl ) + { + if ( args->haplotype || args->allele ) + { + if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H\n"); + args->isample = args->smpl->idx[0]; + } + else + args->iupac_GTs = 1; } int i; for (i=0; inmask; i++) @@ -260,7 +274,7 @@ static void init_data(args_t *args) if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno)); } else args->fp_out = bcftools_stdout; - if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n"); + if ( args->isample<0 && !args->iupac_GTs ) fprintf(bcftools_stderr,"Note: the --samples option not given, applying all records regardless of the genotype\n"); if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); args->rid = -1; @@ -284,8 +298,10 @@ static void add_mask_with(args_t *args, char *with) } static void destroy_data(args_t *args) { + free(args->iupac_als); free(args->iupac_bitmask); if (args->filter) filter_destroy(args->filter); + if ( args->smpl ) smpl_ilist_destroy(args->smpl); bcf_sr_destroy(args->files); int i; for (i=0; ivcf_rbuf.m; i++) @@ -472,6 +488,59 @@ static void mark_snv(char *ref, char *alt, char mark) if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]); } } +static void iupac_init(args_t *args, bcf1_t *rec) +{ + int i; + hts_resize(uint8_t, rec->n_allele, &args->miupac_als, &args->iupac_als, 0); + for (i=0; imiupac_als; i++) args->iupac_als[i] = 0; +} +static int iupac_add_gt(args_t *args, bcf1_t *rec, uint8_t *gt, int ngt) +{ + int i, is_set = 0; + for (i=0; i= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + args->iupac_als[ial] = 1; + is_set = 1; + } + return is_set; +} +static int iupac_set_allele(args_t *args, bcf1_t *rec) +{ + int i,j, max_len = 0, alt_len = 0, 
ialt = -1, fallback_alt = -1; + for (i=0; in_allele; i++) + { + if ( !args->iupac_als[i] ) continue; + if ( fallback_alt <=0 ) fallback_alt = i; + int l = strlen(rec->d.allele[i]); + for (j=0; jd.allele[i][j]) < 0 ) break; + if ( jmax_len ) + { + hts_resize(uint8_t, l, &args->miupac_bitmask, &args->iupac_bitmask, HTS_RESIZE_CLEAR); + for (j=max_len; jiupac_bitmask[j] = 0; + max_len = l; + } + if ( i>0 && l>alt_len ) + { + alt_len = l; + ialt = i; + } + for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]); + } + if ( alt_len > 0 ) + { + for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); + return ialt; + } + if ( fallback_alt >= 0 ) return fallback_alt; + return ialt; +} static void apply_variant(args_t *args, bcf1_t *rec) { static int warned_haplotype = 0; @@ -493,7 +562,25 @@ static void apply_variant(args_t *args, bcf1_t *rec) } int ialt = 1; // the alternate allele - if ( args->isample >= 0 ) + if ( args->iupac_GTs ) + { + bcf_unpack(rec, BCF_UN_FMT); + bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); + if ( !fmt ) return; + if ( fmt->type!=BCF_BT_INT8 ) + error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); + ialt = -1; + int is_set = 0; + iupac_init(args,rec); + for (i=0; ismpl->n; i++) + { + uint8_t *ptr = fmt->p + fmt->size*args->smpl->idx[i]; + is_set += iupac_add_gt(args, rec, ptr, fmt->n); + } + if ( !is_set && !args->missing_allele ) return; + if ( is_set ) ialt = iupac_set_allele(args, rec); + } + else if ( args->isample >= 0 ) { bcf_unpack(rec, BCF_UN_FMT); bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT"); @@ -546,39 +633,10 @@ static void apply_variant(args_t *args, bcf1_t *rec) else if ( action==use_iupac ) { ialt = -1; - int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1; - for (i=0; in; i++) - { - if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; } - if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break; - int jalt 
= bcf_gt_allele(ptr[i]); - if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1); - if ( fallback_alt <= 0 ) fallback_alt = jalt; - - int l = strlen(rec->d.allele[jalt]); - for (j=0; jd.allele[jalt][j]) < 0 ) break; - if ( j mlen ) - { - hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask); - for (j=mlen; jiupac_bitmask[j] = 0; - mlen = l; - } - if ( jalt>0 && l>alen ) - { - alen = l; - ialt = jalt; - } - for (j=0; jiupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]); - } - if ( alen > 0 ) - for (j=0; jd.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]); - else if ( fallback_alt >= 0 ) - ialt = fallback_alt; - else if ( is_missing && !args->missing_allele ) return; + iupac_init(args,rec); + int is_set = iupac_add_gt(args, rec, ptr, fmt->n); + if ( !is_set && !args->missing_allele ) return; + if ( is_set ) ialt = iupac_set_allele(args, rec); } else { @@ -1032,16 +1090,16 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "\n"); fprintf(bcftools_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n"); fprintf(bcftools_stderr, " file. By default, the program will apply all ALT variants. Using the\n"); - fprintf(bcftools_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n"); + fprintf(bcftools_stderr, " --samples (and, optionally, --haplotype) option will apply genotype\n"); fprintf(bcftools_stderr, " (or haplotype) calls from FORMAT/GT. 
The program ignores allelic depth\n"); fprintf(bcftools_stderr, " information, such as INFO/AD or FORMAT/AD.\n"); fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] \n"); fprintf(bcftools_stderr, "Options:\n"); - fprintf(bcftools_stderr, " -c, --chain FILE write a chain file for liftover\n"); - fprintf(bcftools_stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n"); - fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n"); - fprintf(bcftools_stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n"); + fprintf(bcftools_stderr, " -c, --chain FILE Write a chain file for liftover\n"); + fprintf(bcftools_stderr, " -a, --absent CHAR Replace positions absent from VCF with CHAR\n"); + fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n"); + fprintf(bcftools_stderr, " -H, --haplotype WHICH Choose which allele to use from the FORMAT/GT field, note\n"); fprintf(bcftools_stderr, " the codes are case-insensitive:\n"); fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n"); fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n"); @@ -1051,17 +1109,18 @@ static void usage(args_t *args) fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n"); fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n"); fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n"); - fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n"); - fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC 
ambiguity codes\n"); - fprintf(bcftools_stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n"); - fprintf(bcftools_stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); - fprintf(bcftools_stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); - fprintf(bcftools_stderr, " -m, --mask FILE replace regions according to the next --mask-with option. The default is --mask-with N\n"); - fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n"); - fprintf(bcftools_stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n"); - fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n"); - fprintf(bcftools_stderr, " -p, --prefix STRING prefix to add to output sequence names\n"); - fprintf(bcftools_stderr, " -s, --sample NAME apply variants of the given sample\n"); + fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n"); + fprintf(bcftools_stderr, " -I, --iupac-codes Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n"); + fprintf(bcftools_stderr, " --mark-del CHAR Instead of removing sequence, insert CHAR for deletions\n"); + fprintf(bcftools_stderr, " --mark-ins uc|lc Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(bcftools_stderr, " --mark-snv uc|lc Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n"); + fprintf(bcftools_stderr, " -m, --mask FILE Replace regions according to the next --mask-with option. 
The default is --mask-with N\n"); + fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n"); + fprintf(bcftools_stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n"); + fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n"); + fprintf(bcftools_stderr, " -p, --prefix STRING Prefix to add to output sequence names\n"); + fprintf(bcftools_stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n"); + fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n"); fprintf(bcftools_stderr, "Examples:\n"); fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n"); fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n"); @@ -1086,6 +1145,8 @@ int main_consensus(int argc, char *argv[]) {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"sample",1,0,'s'}, + {"samples",1,0,'s'}, + {"samples-file",1,0,'S'}, {"iupac-codes",0,0,'I'}, {"haplotype",1,0,'H'}, {"output",1,0,'o'}, @@ -1098,7 +1159,7 @@ int main_consensus(int argc, char *argv[]) {0,0,0,0} }; int c; - while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) + while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0) { switch (c) { @@ -1115,6 +1176,7 @@ int main_consensus(int argc, char *argv[]) break; case 'p': args->chr_prefix = optarg; break; case 's': args->sample = optarg; break; + case 'S': args->sample_fname = optarg; break; case 'o': args->output_fname = optarg; break; case 'I': args->output_iupac = 1; break; case 'e': diff --git a/bcftools/convert.c b/bcftools/convert.c index 5317cb8..80e5474 100644 --- a/bcftools/convert.c +++ b/bcftools/convert.c @@ -1,6 +1,6 @@ /* convert.c -- functions for converting between VCF/BCF and related 
formats. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -1117,14 +1117,21 @@ static void process_rsid_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int i static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { + const char *alt = NULL; + size_t sizealt = 0; + if ( line->n_allele>1 ) + { + alt = line->d.allele[1]; + sizealt = strlen(line->d.allele[1]); + } uint64_t vk = variantkey( convert->header->id[BCF_DT_CTG][line->rid].key, strlen(convert->header->id[BCF_DT_CTG][line->rid].key), line->pos, line->d.allele[0], strlen(line->d.allele[0]), - line->d.allele[1], - strlen(line->d.allele[1])); + alt, + sizealt); ksprintf(str, "%016" PRIx64 "", vk); } @@ -1561,7 +1568,7 @@ int convert_header(convert_t *convert, kstring_t *str) if ( i!=convert->nfmt ) return str->l - l_ori; - kputs("# ", str); + kputc('#', str); for (i=0; infmt; i++) { // Genotype fields diff --git a/bcftools/convert.c.pysam.c b/bcftools/convert.c.pysam.c index 6b9e851..92f9d01 100644 --- a/bcftools/convert.c.pysam.c +++ b/bcftools/convert.c.pysam.c @@ -2,7 +2,7 @@ /* convert.c -- functions for converting between VCF/BCF and related formats. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -1119,14 +1119,21 @@ static void process_rsid_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int i static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { + const char *alt = NULL; + size_t sizealt = 0; + if ( line->n_allele>1 ) + { + alt = line->d.allele[1]; + sizealt = strlen(line->d.allele[1]); + } uint64_t vk = variantkey( convert->header->id[BCF_DT_CTG][line->rid].key, strlen(convert->header->id[BCF_DT_CTG][line->rid].key), line->pos, line->d.allele[0], strlen(line->d.allele[0]), - line->d.allele[1], - strlen(line->d.allele[1])); + alt, + sizealt); ksprintf(str, "%016" PRIx64 "", vk); } @@ -1563,7 +1570,7 @@ int convert_header(convert_t *convert, kstring_t *str) if ( i!=convert->nfmt ) return str->l - l_ori; - kputs("# ", str); + kputc('#', str); for (i=0; infmt; i++) { // Genotype fields diff --git a/bcftools/csq.c b/bcftools/csq.c index de0d7a9..49812d4 100644 --- a/bcftools/csq.c +++ b/bcftools/csq.c @@ -1,19 +1,19 @@ /* The MIT License - Copyright (c) 2016-2021 Genome Research Ltd. + Copyright (c) 2016-2023 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -76,12 +76,12 @@ B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/ C .. corresponding CDS, exon, and UTR lines: - C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ + C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the complete chain link C -> B -> A is required. For the rest, link B -> A suffices. - - + + The supported consequence types, sorted by impact: splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron) splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron) @@ -119,18 +119,18 @@ (based on biotype) which maps from transcript_id to a transcript. At the same time also build the hash "gid2gene" which maps from gene_id to gf_gene_t pointer. - + 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes. Use only features from "ftr" which are present in "id2tr". 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene. - + Data structures. 
idx_cds, idx_utr, idx_exon, idx_tscript: as described above, regidx structures for fast lookup of exons/transcripts overlapping a region, the payload is a pointer to tscript.cds */ - + #include #include #include @@ -163,9 +163,9 @@ #define FLT_EXCLUDE 2 // Definition of splice_region, splice_acceptor and splice_donor -#define N_SPLICE_DONOR 2 -#define N_SPLICE_REGION_EXON 3 -#define N_SPLICE_REGION_INTRON 8 +#define N_SPLICE_DONOR 2 +#define N_SPLICE_REGION_EXON 3 +#define N_SPLICE_REGION_INTRON 8 #define N_REF_PAD 10 // number of bases to avoid boundary effects @@ -186,7 +186,7 @@ // Node types in the haplotype tree #define HAP_CDS 0 -#define HAP_ROOT 1 +#define HAP_ROOT 1 #define HAP_SSS 2 // start/stop/splice #define CSQ_PRINTED_UPSTREAM (1<<0) @@ -226,25 +226,25 @@ #define CSQ_PRN_BIOTYPE CSQ_NON_CODING // see kput_vcsq() -const char *csq_strings[] = +const char *csq_strings[] = { - NULL, - "synonymous", - "missense", - "stop_lost", - "stop_gained", - "inframe_deletion", - "inframe_insertion", - "frameshift", - "splice_acceptor", - "splice_donor", - "start_lost", - "splice_region", - "stop_retained", - "5_prime_utr", - "3_prime_utr", - "non_coding", - "intron", + NULL, + "synonymous", + "missense", + "stop_lost", + "stop_gained", + "inframe_deletion", + "inframe_insertion", + "frameshift", + "splice_acceptor", + "splice_donor", + "start_lost", + "splice_region", + "stop_retained", + "5_prime_utr", + "3_prime_utr", + "non_coding", + "intron", "intergenic", "inframe_altering", NULL, @@ -256,11 +256,12 @@ const char *csq_strings[] = // GFF line types +#define GFF_UNKN_LINE 0 #define GFF_TSCRIPT_LINE 1 #define GFF_GENE_LINE 2 -/* +/* Genomic features, for fast lookup by position to overlapping features */ #define GF_coding_bit 6 @@ -505,9 +506,9 @@ hap_t; /* Helper structures, only for initialization - + ftr_t - temporary list of all exons, CDS, UTRs + temporary list of all exons, CDS, UTRs */ KHASH_MAP_INIT_INT(int2tscript, tscript_t*) KHASH_MAP_INIT_INT(int2gene, 
gf_gene_t*) @@ -596,7 +597,7 @@ typedef struct _args_t int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; int brief_predictions; - + int rid; // current chromosome tr_heap_t *active_tr; // heap of active transcripts for quick flushing hap_t *hap; // transcript haplotype recursion @@ -644,13 +645,13 @@ const uint8_t cnt4[] = #define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] -static const char *gf_strings_noncoding[] = -{ +static const char *gf_strings_noncoding[] = +{ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", - "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", - "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", - "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", + "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", + "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", + "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf" @@ -755,10 +756,11 @@ static 
void gff_id_destroy(id_tbl_t *tbl) khash_str2int_destroy_free(tbl->str2id); free(tbl->str); } -static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss) +// returns 0 on success, -1 on failure +static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr) { ss = strstr(ss,needle); // e.g. "ID=transcript:" - if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); + if ( !ss ) return -1; ss += strlen(needle); char *se = ss; @@ -775,8 +777,8 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char khash_str2int_set(tbl->str2id, tbl->str[id], id); } *se = tmp; - - return id; + *id_ptr = id; + return 0; } static inline int gff_parse_type(char *line) { @@ -795,7 +797,7 @@ static inline int gff_parse_biotype(char *_line) line += 8; switch (*line) { - case 'p': + case 'p': if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; @@ -859,7 +861,7 @@ static inline int gff_parse_biotype(char *_line) case 't': if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; @@ -931,13 +933,34 @@ 
void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) int biotype = gff_parse_biotype(ss); if ( biotype <= 0 ) { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript: %s\n",line); + if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line); return; } // create a mapping from transcript_id to gene_id - uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss); - uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss); + uint32_t trid, gene_id; + if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) ) + { + if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) ) + error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line); + warned = 1; + } + } + if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) ) + { + if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line); + warned = 1; + } + } tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); tr->id = trid; @@ -957,14 +980,26 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha int biotype = gff_parse_biotype(ss); if ( biotype <= 0 ) { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene: %s\n",line); + if ( 
!gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line); return; } aux_t *aux = &args->init; // substring search for "ID=gene:ENSG00000437963" - uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss); + uint32_t gene_id; + if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) ) + { + if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) ) + error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line); + warned = 1; + } + } + gf_gene_t *gene = gene_init(aux, gene_id); assert( !gene->name ); // the gene_id should be unique @@ -987,7 +1022,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha int gff_parse(args_t *args, char *line, ftr_t *ftr) { // - skip empty lines and commented lines - // - columns + // - columns // 1. chr // 2. // 3. CDS, transcript, gene, ... 
@@ -1012,11 +1047,14 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } else { + int type = GFF_UNKN_LINE; + if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE; + else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE; ss = gff_skip(line, ss); ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); ss = gff_skip(line, ss); - int type = gff_parse_type(ss); - if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) + if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene: + if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) { // we ignore these, debug print to see new types: ss = strstr(ss,"ID="); @@ -1057,7 +1095,18 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) ss += 2; // substring search for "Parent=transcript:ENST00000437963" - ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss); + if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) ) + { + if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line); + warned = 1; + } + } + ftr->iseq = feature_set_seq(args, chr_beg,chr_end); return 0; } @@ -1090,14 +1139,14 @@ void register_cds(args_t *args, ftr_t *ftr) tscript_t *tr = tscript_init(aux, ftr->trid); if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. 
%d vs %d\n",ftr->trid,tr->strand,ftr->strand); - + gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); cds->tr = tr; cds->beg = ftr->beg; cds->len = ftr->end - ftr->beg + 1; cds->icds = 0; // to keep valgrind on mac happy cds->phase = ftr->phase; - + hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); tr->cds[tr->ncds++] = cds; } @@ -1186,7 +1235,7 @@ void tscript_init_cds(args_t *args) error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } - len += tr->cds[i]->len; + len += tr->cds[i]->len; } if ( !tscript_ok ) continue; // skip this transcript } @@ -1245,12 +1294,12 @@ void tscript_init_cds(args_t *args) for (i=0; incds; i++) { tr->cds[i]->icds = i; - len += tr->cds[i]->len; + len += tr->cds[i]->len; if ( !i ) continue; gf_cds_t *a = tr->cds[i-1]; gf_cds_t *b = tr->cds[i]; - if ( a->beg + a->len - 1 >= b->beg ) + if ( a->beg + a->len - 1 >= b->beg ) { if ( args->force ) { @@ -1259,7 +1308,7 @@ void tscript_init_cds(args_t *args) } else error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n" - " Use the --force option to override (at your own risk).\n", + " Use the --force option to override (at your own risk).\n", args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); } } @@ -1360,7 +1409,7 @@ void init_gff(args_t *args) continue; } - // populate regidx by category: + // populate regidx by category: // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... 
if ( ftr->type==GF_CDS ) register_cds(args, ftr); @@ -1374,12 +1423,17 @@ void init_gff(args_t *args) if ( args->verbosity > 0 ) { - fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", regidx_nregs(args->idx_tscript), regidx_nregs(args->idx_exon), regidx_nregs(args->idx_cds), regidx_nregs(args->idx_utr)); } + if ( !regidx_nregs(args->idx_tscript) ) + fprintf(stderr, + "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n" + " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n" + " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); free(aux->ftr); khash_str2int_destroy_free(aux->seq2int); @@ -1437,7 +1491,7 @@ void init_data(args_t *args) if ( args->sample_list && !strcmp("-",args->sample_list) ) { // ignore all samples - if ( args->output_type==FT_TAB_TEXT ) + if ( args->output_type==FT_TAB_TEXT ) { // significant speedup for plain VCFs if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0) @@ -1479,7 +1533,7 @@ void init_data(args_t *args) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? 
"Local" : "Haplotype-aware"); - if ( args->hdr_nsmpl ) + if ( args->hdr_nsmpl ) bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); } @@ -1556,7 +1610,7 @@ void destroy_data(args_t *args) */ #define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely #define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region -#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed +#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed #define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq typedef struct { @@ -1567,16 +1621,16 @@ typedef struct bcf1_t *rec; } vcf; uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev) - check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon + check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon check_donor:1, // as with check_acceptor check_region_beg:1, // do/don't check for splices at this end, eg. 
in the first or last exon - check_region_end:1, // + check_region_end:1, // check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr set_refalt:1; // set kref,kalt, if set, check also for synonymous events uint32_t csq; int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele - uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives + uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives ref_end; // a more conservative csq (the first and last base in kref.s) kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP } @@ -1615,7 +1669,7 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) #define XDBG 0 #if XDBG fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); -#endif +#endif splice->kref.l = 0; splice->kalt.l = 0; @@ -1703,7 +1757,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); tscript_t *tr = utr->tr; if ( tr->id != trid ) continue; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | type; @@ -1723,7 +1777,7 @@ static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, ui fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); #endif if ( !type ) return; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = type; @@ -1763,7 +1817,7 @@ fprintf(stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); - if ( ret!=0 ) + if ( ret!=0 ) { regitr_destroy(itr); return SPLICE_OUTSIDE; // overlaps utr @@ -1910,7 +1964,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced } - else + else { // STRAND_FWD int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion @@ -2008,7 +2062,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% } } } - if ( splice->ref_end >= ex_beg ) + if ( splice->ref_end >= ex_beg ) { splice->tbeg = splice->ref_beg - splice->vcf.pos + 1; splice->ref_beg = ex_beg - 1; @@ -2058,7 +2112,7 @@ fprintf(stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% } } } - if ( splice->ref_beg < ex_end ) + if ( splice->ref_beg < ex_end ) { splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); splice->ref_end = ex_end; @@ -2089,8 +2143,8 @@ fprintf(stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,% splice->vcf.rlen -= splice->tbeg + splice->tend; splice->vcf.alen -= splice->tbeg + splice->tend; } - splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); - splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf { splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION; @@ -2137,7 +2191,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut } } } - if ( splice->ref_end >= ex_beg ) + if ( splice->ref_end >= ex_beg ) { splice->tbeg = splice->ref_beg - splice->vcf.pos; splice->ref_beg = ex_beg; @@ -2167,7 +2221,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut } } } - if ( splice->ref_beg <= ex_end ) + if ( splice->ref_beg <= ex_end ) { splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); splice->ref_end = ex_end; @@ -2194,8 +2248,8 @@ fprintf(stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut if ( splice->set_refalt ) { splice->vcf.rlen -= splice->tbeg + splice->tend; - splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); - splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); } csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; @@ -2311,7 +2365,7 @@ fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n } assert( parent->type!=HAP_SSS ); - if ( parent->type==HAP_CDS ) + if ( parent->type==HAP_CDS ) { i = parent->icds; if ( i!=cds->icds ) @@ -2393,7 +2447,7 @@ void hap_destroy(hap_node_t *hap) /* ref: spliced reference and its length (ref.l) - seq: part of the spliced query transcript on the reference strand to translate, its + seq: part of the spliced query transcript on the reference strand to translate, its length (seq.l) and the total length of the complete transcript (seq.m) sbeg: seq offset within the spliced query transcript rbeg: seq offset within ref, 0-based @@ -2501,7 +2555,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, else // STRAND_REV { // right padding - number of bases to take from ref - npad = (seq.m - (sbeg + seq.l)) % 3; + npad = (seq.m - (sbeg + seq.l)) % 3; #if DBG>1 fprintf(stderr," npad: %d\n",npad); #endif @@ -2546,12 +2600,12 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, } if ( seq.s-codon==2 ) { - tmp[2] = seq.s[0]; + tmp[2] = seq.s[0]; i = 1; } else if ( seq.s-codon==1 ) { - tmp[1] = seq.s[0]; + tmp[1] = seq.s[0]; tmp[2] = seq.s[1]; i = 0; } @@ -2594,7 +2648,7 @@ fprintf(stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill, void 
tscript_splice_ref(tscript_t *tr) { int i, len = 0; - for (i=0; incds; i++) + for (i=0; incds; i++) len += tr->cds[i]->len; tr->nsref = len + 2*N_REF_PAD; @@ -2632,7 +2686,7 @@ fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); vrec_t *vrec = vbuf->vrec[i]; // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor - if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) ) + if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) ) csq->type.type &= ~CSQ_SPLICE_REGION; if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) @@ -2661,7 +2715,7 @@ fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); if ( csq->type.gene != vrec->vcsq[i].gene ) continue; if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue; if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop - if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) + if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) { // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered @@ -2669,14 +2723,14 @@ fprintf(stderr,"csq_push: %d .. 
%d\n",rec->pos+1,csq->type.type); // consequences: // stop_lost|AL627309.1|ENST00000423372|protein_coding|- // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA - if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s ) + if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s ) { if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP ) { vrec->vcsq[i].type |= csq->type.type; // remove stop_lost&synonymous if stop_retained set - if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) + if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT); if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr; @@ -2686,7 +2740,7 @@ fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); } if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue; } - vrec->vcsq[i].type |= csq->type.type; + vrec->vcsq[i].type |= csq->type.type; goto exit_duplicate; } } @@ -2696,7 +2750,7 @@ fprintf(stderr,"csq_push: %d .. 
%d\n",rec->pos+1,csq->type.type); { if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue; if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; - if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) ) + if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) ) { vrec->vcsq[i].type |= csq->type.type; goto exit_duplicate; @@ -2799,7 +2853,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.biotype = tr->type; // only now we see the translated sequence and can determine if the stop/start changes are real - int rm_csq = 0; + int rm_csq = 0; csq->type.type = 0; for (i=ibeg; i<=iend; i++) csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND; @@ -2826,7 +2880,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } if ( csq->type.type & CSQ_STOP_LOST ) { - if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) + if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) { rm_csq |= CSQ_STOP_LOST; csq->type.type |= CSQ_STOP_RETAINED; @@ -2862,16 +2916,20 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } else { - for (i=0; itref.l; i++) - if ( hap->tref.s[i] != hap->tseq.s[i] ) break; - if ( i==hap->tref.l ) + int aa_change = 0; + for (i=0; itref.l; i++) + { + if ( hap->tref.s[i] == hap->tseq.s[i] ) continue; + aa_change = 1; + if ( hap->tref.s[i] == '*' ) + csq->type.type |= CSQ_STOP_LOST; + else if ( hap->tseq.s[i] == '*' ) + csq->type.type |= CSQ_STOP_GAINED; + else + csq->type.type |= CSQ_MISSENSE_VARIANT; + } + if ( !aa_change ) csq->type.type |= CSQ_SYNONYMOUS_VARIANT; - else if ( hap->tref.s[i] == '*' ) - csq->type.type |= CSQ_STOP_LOST; - else if ( hap->tseq.s[i] == '*' ) - csq->type.type |= CSQ_STOP_GAINED; - else - csq->type.type |= CSQ_MISSENSE_VARIANT; } } // Check if compound inframe variants are real inframes, or if the stop codon 
occurs before the frameshift can be restored @@ -3009,7 +3067,7 @@ void hap_finalize(args_t *args, hap_t *hap) // The spliced sequence has been built for the current haplotype and stored // in hap->sseq. Now we break it and output as independent parts - + kstring_t sseq; sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript hap->upstream_stop = 0; @@ -3267,7 +3325,7 @@ vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) // check for duplicate records i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1; - if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos ) + if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos ) { // vcf record with a new pos rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf); @@ -3333,7 +3391,7 @@ void vbuf_flush(args_t *args, uint32_t pos) vrec->line->pos = save_pos; // this is necessary for compound variants continue; } - + args->str.l = 0; kput_vcsq(args, &vrec->vcsq[0], &args->str); for (j=1; jnvcsq; j++) @@ -3411,7 +3469,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) int i = 0; while ( ref[i] && vcf[i] ) { - if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) + if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. 
fasta=%c vcf=%c\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); i++; @@ -3456,7 +3514,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -3504,7 +3562,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) } if ( csq_type & CSQ_STOP_LOST ) { - if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) + if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) { csq_type &= ~CSQ_STOP_LOST; csq_type |= CSQ_STOP_RETAINED; @@ -3537,16 +3595,20 @@ int test_cds_local(args_t *args, bcf1_t *rec) } else { - for (j=0; jl; j++) - if ( tref->s[j] != tseq->s[j] ) break; - if ( j==tref->l ) + int aa_change = 0; + for (j=0; jl; j++) + { + if ( tref->s[j] == tseq->s[j] ) continue; + aa_change = 1; + if ( tref->s[j] == '*' ) + csq_type |= CSQ_STOP_LOST; + else if ( tseq->s[j] == '*' ) + csq_type |= CSQ_STOP_GAINED; + else + csq_type |= CSQ_MISSENSE_VARIANT; + } + if ( !aa_change ) csq_type |= CSQ_SYNONYMOUS_VARIANT; - else if ( tref->s[j] == '*' ) - csq_type |= CSQ_STOP_LOST; - else if ( tseq->s[j] == '*' ) - csq_type |= CSQ_STOP_GAINED; - else - csq_type |= CSQ_MISSENSE_VARIANT; } if ( csq_type & CSQ_COMPOUND ) { @@ -3576,7 +3638,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) tr->root->ncsq_list++; hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list); csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1; - rm_csq->type.vstr = str; + rm_csq->type.vstr = str; } if ( csq_type & ~CSQ_COMPOUND ) { @@ -3644,7 +3706,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); overlaps_warned = 1; } - if ( args->out ) + if ( args->out ) 
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); } else ret = 1; // prevent reporting as intron in test_tscript @@ -3653,7 +3715,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) } if ( child->type==HAP_SSS ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -3680,7 +3742,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) // apply the VCF variants and extend the haplotype tree int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); ngts /= bcf_hdr_nsamples(args->hdr); - if ( ngts!=1 && ngts!=2 ) + if ( ngts!=1 && ngts!=2 ) { if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) ) { @@ -3691,7 +3753,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); multiploid_warned = 1; } - if ( args->out ) + if ( args->out ) fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); continue; } @@ -3766,7 +3828,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) } if ( child->type==HAP_SSS ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -3890,7 +3952,7 @@ int test_utr(args_t *args, bcf1_t *rec) splice.csq = 0; int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3; @@ -3958,7 +4020,7 @@ int test_tscript(args_t *args, bcf1_t *rec) splice.csq = 0; int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING; @@ -3996,7 +4058,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) { while ( regitr_overlap(args->itr) ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); tscript_t *tr = cds->tr; @@ -4014,7 +4076,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) { while ( regitr_overlap(args->itr) ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); tscript_t *tr = utr->tr; @@ -4054,7 +4116,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) while ( regitr_overlap(args->itr) ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); splice.vcf.alt = rec->d.allele[1]; @@ -4143,7 +4205,7 @@ static void process(args_t *args, bcf1_t **rec_ptr) return; } - if ( args->rid != rec->rid ) + if ( args->rid != rec->rid ) { hap_flush(args, REGIDX_MAX); vbuf_flush(args, REGIDX_MAX); @@ -4172,7 +4234,7 @@ static void process(args_t *args, bcf1_t **rec_ptr) static const char *usage(void) { - return + return "\n" "About: Haplotype-aware consequence caller.\n" "Usage: bcftools csq [OPTIONS] in.vcf\n" @@ -4182,7 +4244,7 @@ static const char *usage(void) " -g, --gff-annot FILE GFF3 annotation file\n" "\n" "CSQ options:\n" - " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" + " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n" " -l, 
--local-csq Localized predictions, consider only one VCF record at a time\n" " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n" @@ -4261,13 +4323,13 @@ int main_csq(int argc, char *argv[]) {"no-version",no_argument,NULL,3}, {0,0,0,0} }; - int c, targets_is_file = 0, regions_is_file = 0; + int c, targets_is_file = 0, regions_is_file = 0; int regions_overlap = 1; int targets_overlap = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->force = 1; break; case 2 : @@ -4279,19 +4341,19 @@ int main_csq(int argc, char *argv[]) args->brief_predictions = 1; fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n"); break; - case 'B': + case 'B': args->brief_predictions = strtol(optarg,&tmp,10); if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg); break; case 'l': args->local_csq = 1; break; case 'c': args->bcsq_tag = optarg; break; case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; - case 'v': + case 'v': args->verbosity = atoi(optarg); if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); break; case 'p': - switch (optarg[0]) + switch (optarg[0]) { case 'a': args->phase = PHASE_AS_IS; break; case 'm': args->phase = PHASE_MERGE; break; @@ -4303,7 +4365,7 @@ int main_csq(int argc, char *argv[]) break; case 'f': args->fa_fname = optarg; break; case 'g': args->gff_fname = optarg; break; - case 'n': + case 'n': args->ncsq2_max = 2 * atoi(optarg); if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg); break; diff --git a/bcftools/csq.c.pysam.c b/bcftools/csq.c.pysam.c index e4abf4b..8feb7af 100644 --- a/bcftools/csq.c.pysam.c +++ b/bcftools/csq.c.pysam.c 
@@ -2,20 +2,20 @@ /* The MIT License - Copyright (c) 2016-2021 Genome Research Ltd. + Copyright (c) 2016-2023 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -78,12 +78,12 @@ B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/ C .. corresponding CDS, exon, and UTR lines: - C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ + C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/ For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the complete chain link C -> B -> A is required. For the rest, link B -> A suffices. - - + + The supported consequence types, sorted by impact: splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron) splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron) @@ -121,18 +121,18 @@ (based on biotype) which maps from transcript_id to a transcript. At the same time also build the hash "gid2gene" which maps from gene_id to gf_gene_t pointer. - + 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes. Use only features from "ftr" which are present in "id2tr". 3. 
clean data that won't be needed anymore: ftr, id2tr, gid2gene. - + Data structures. idx_cds, idx_utr, idx_exon, idx_tscript: as described above, regidx structures for fast lookup of exons/transcripts overlapping a region, the payload is a pointer to tscript.cds */ - + #include #include #include @@ -165,9 +165,9 @@ #define FLT_EXCLUDE 2 // Definition of splice_region, splice_acceptor and splice_donor -#define N_SPLICE_DONOR 2 -#define N_SPLICE_REGION_EXON 3 -#define N_SPLICE_REGION_INTRON 8 +#define N_SPLICE_DONOR 2 +#define N_SPLICE_REGION_EXON 3 +#define N_SPLICE_REGION_INTRON 8 #define N_REF_PAD 10 // number of bases to avoid boundary effects @@ -188,7 +188,7 @@ // Node types in the haplotype tree #define HAP_CDS 0 -#define HAP_ROOT 1 +#define HAP_ROOT 1 #define HAP_SSS 2 // start/stop/splice #define CSQ_PRINTED_UPSTREAM (1<<0) @@ -228,25 +228,25 @@ #define CSQ_PRN_BIOTYPE CSQ_NON_CODING // see kput_vcsq() -const char *csq_strings[] = +const char *csq_strings[] = { - NULL, - "synonymous", - "missense", - "stop_lost", - "stop_gained", - "inframe_deletion", - "inframe_insertion", - "frameshift", - "splice_acceptor", - "splice_donor", - "start_lost", - "splice_region", - "stop_retained", - "5_prime_utr", - "3_prime_utr", - "non_coding", - "intron", + NULL, + "synonymous", + "missense", + "stop_lost", + "stop_gained", + "inframe_deletion", + "inframe_insertion", + "frameshift", + "splice_acceptor", + "splice_donor", + "start_lost", + "splice_region", + "stop_retained", + "5_prime_utr", + "3_prime_utr", + "non_coding", + "intron", "intergenic", "inframe_altering", NULL, @@ -258,11 +258,12 @@ const char *csq_strings[] = // GFF line types +#define GFF_UNKN_LINE 0 #define GFF_TSCRIPT_LINE 1 #define GFF_GENE_LINE 2 -/* +/* Genomic features, for fast lookup by position to overlapping features */ #define GF_coding_bit 6 @@ -507,9 +508,9 @@ hap_t; /* Helper structures, only for initialization - + ftr_t - temporary list of all exons, CDS, UTRs + temporary list of all exons, 
CDS, UTRs */ KHASH_MAP_INIT_INT(int2tscript, tscript_t*) KHASH_MAP_INIT_INT(int2gene, gf_gene_t*) @@ -598,7 +599,7 @@ typedef struct _args_t int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values) int ncsq2_small_warned; int brief_predictions; - + int rid; // current chromosome tr_heap_t *active_tr; // heap of active transcripts for quick flushing hap_t *hap; // transcript haplotype recursion @@ -646,13 +647,13 @@ const uint8_t cnt4[] = #define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ] #define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ] -static const char *gf_strings_noncoding[] = -{ +static const char *gf_strings_noncoding[] = +{ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript", "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping", - "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", - "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", - "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", + "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene", + "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene", + "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene", "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene", "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene", "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", 
"vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf" @@ -757,10 +758,11 @@ static void gff_id_destroy(id_tbl_t *tbl) khash_str2int_destroy_free(tbl->str2id); free(tbl->str); } -static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss) +// returns 0 on success, -1 on failure +static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr) { ss = strstr(ss,needle); // e.g. "ID=transcript:" - if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line); + if ( !ss ) return -1; ss += strlen(needle); char *se = ss; @@ -777,8 +779,8 @@ static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char khash_str2int_set(tbl->str2id, tbl->str[id], id); } *se = tmp; - - return id; + *id_ptr = id; + return 0; } static inline int gff_parse_type(char *line) { @@ -797,7 +799,7 @@ static inline int gff_parse_biotype(char *_line) line += 8; switch (*line) { - case 'p': + case 'p': if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING; else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE; else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT; @@ -861,7 +863,7 @@ static inline int gff_parse_biotype(char *_line) case 't': if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE; else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE; - else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; + else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE; else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE; else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE; else if ( 
!strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE; @@ -933,13 +935,34 @@ void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr) int biotype = gff_parse_biotype(ss); if ( biotype <= 0 ) { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript: %s\n",line); + if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript, unknown biotype: %s\n",line); return; } // create a mapping from transcript_id to gene_id - uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss); - uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss); + uint32_t trid, gene_id; + if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) ) + { + if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) ) + error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(bcftools_stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line); + warned = 1; + } + } + if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) ) + { + if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(bcftools_stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line); + warned = 1; + } + } tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t)); tr->id = trid; @@ -959,14 +982,26 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha int biotype = 
gff_parse_biotype(ss); if ( biotype <= 0 ) { - if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene: %s\n",line); + if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene, unknown biotype: %s\n",line); return; } aux_t *aux = &args->init; // substring search for "ID=gene:ENSG00000437963" - uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss); + uint32_t gene_id; + if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) ) + { + if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) ) + error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(bcftools_stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line); + warned = 1; + } + } + gf_gene_t *gene = gene_init(aux, gene_id); assert( !gene->name ); // the gene_id should be unique @@ -989,7 +1024,7 @@ void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, cha int gff_parse(args_t *args, char *line, ftr_t *ftr) { // - skip empty lines and commented lines - // - columns + // - columns // 1. chr // 2. // 3. CDS, transcript, gene, ... 
@@ -1014,11 +1049,14 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; } else { + int type = GFF_UNKN_LINE; + if ( !strncmp("gene\t",ss,4) ) type = GFF_GENE_LINE; + else if ( !strncmp("transcript\t",ss,4) ) type = GFF_TSCRIPT_LINE; ss = gff_skip(line, ss); ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end); ss = gff_skip(line, ss); - int type = gff_parse_type(ss); - if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) + if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene: + if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE ) { // we ignore these, debug print to see new types: ss = strstr(ss,"ID="); @@ -1059,7 +1097,18 @@ int gff_parse(args_t *args, char *line, ftr_t *ftr) ss += 2; // substring search for "Parent=transcript:ENST00000437963" - ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss); + if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) ) + { + if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) ) + error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line); + static int warned = 0; + if ( !warned && args->verbosity > 0 ) + { + fprintf(bcftools_stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line); + warned = 1; + } + } + ftr->iseq = feature_set_seq(args, chr_beg,chr_end); return 0; } @@ -1092,14 +1141,14 @@ void register_cds(args_t *args, ftr_t *ftr) tscript_t *tr = tscript_init(aux, ftr->trid); if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. 
%d vs %d\n",ftr->trid,tr->strand,ftr->strand); - + gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t)); cds->tr = tr; cds->beg = ftr->beg; cds->len = ftr->end - ftr->beg + 1; cds->icds = 0; // to keep valgrind on mac happy cds->phase = ftr->phase; - + hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds); tr->cds[tr->ncds++] = cds; } @@ -1188,7 +1237,7 @@ void tscript_init_cds(args_t *args) error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n", args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len); } - len += tr->cds[i]->len; + len += tr->cds[i]->len; } if ( !tscript_ok ) continue; // skip this transcript } @@ -1247,12 +1296,12 @@ void tscript_init_cds(args_t *args) for (i=0; incds; i++) { tr->cds[i]->icds = i; - len += tr->cds[i]->len; + len += tr->cds[i]->len; if ( !i ) continue; gf_cds_t *a = tr->cds[i-1]; gf_cds_t *b = tr->cds[i]; - if ( a->beg + a->len - 1 >= b->beg ) + if ( a->beg + a->len - 1 >= b->beg ) { if ( args->force ) { @@ -1261,7 +1310,7 @@ void tscript_init_cds(args_t *args) } else error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n" - " Use the --force option to override (at your own risk).\n", + " Use the --force option to override (at your own risk).\n", args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len); } } @@ -1362,7 +1411,7 @@ void init_gff(args_t *args) continue; } - // populate regidx by category: + // populate regidx by category: // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5 // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ... 
if ( ftr->type==GF_CDS ) register_cds(args, ftr); @@ -1376,12 +1425,17 @@ void init_gff(args_t *args) if ( args->verbosity > 0 ) { - fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", + fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n", regidx_nregs(args->idx_tscript), regidx_nregs(args->idx_exon), regidx_nregs(args->idx_cds), regidx_nregs(args->idx_utr)); } + if ( !regidx_nregs(args->idx_tscript) ) + fprintf(bcftools_stderr, + "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n" + " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n" + " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n"); free(aux->ftr); khash_str2int_destroy_free(aux->seq2int); @@ -1439,7 +1493,7 @@ void init_data(args_t *args) if ( args->sample_list && !strcmp("-",args->sample_list) ) { // ignore all samples - if ( args->output_type==FT_TAB_TEXT ) + if ( args->output_type==FT_TAB_TEXT ) { // significant speedup for plain VCFs if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0) @@ -1481,7 +1535,7 @@ void init_data(args_t *args) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p); if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq"); bcf_hdr_printf(args->hdr,"##INFO=",args->bcsq_tag, args->local_csq ? 
"Local" : "Haplotype-aware"); - if ( args->hdr_nsmpl ) + if ( args->hdr_nsmpl ) bcf_hdr_printf(args->hdr,"##FORMAT=",args->bcsq_tag); if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output"); } @@ -1558,7 +1612,7 @@ void destroy_data(args_t *args) */ #define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely #define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region -#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed +#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed #define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq typedef struct { @@ -1569,16 +1623,16 @@ typedef struct bcf1_t *rec; } vcf; uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev) - check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon + check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon check_donor:1, // as with check_acceptor check_region_beg:1, // do/don't check for splices at this end, eg. 
in the first or last exon - check_region_end:1, // + check_region_end:1, // check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr set_refalt:1; // set kref,kalt, if set, check also for synonymous events uint32_t csq; int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele - uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives + uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives ref_end; // a more conservative csq (the first and last base in kref.s) kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP } @@ -1617,7 +1671,7 @@ static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len) #define XDBG 0 #if XDBG fprintf(bcftools_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg); -#endif +#endif splice->kref.l = 0; splice->kalt.l = 0; @@ -1705,7 +1759,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32 gf_utr_t *utr = regitr_payload(itr, gf_utr_t*); tscript_t *tr = utr->tr; if ( tr->id != trid ) continue; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | type; @@ -1725,7 +1779,7 @@ static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, ui fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type); #endif if ( !type ) return; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = type; @@ -1765,7 +1819,7 @@ fprintf(bcftools_stderr,"ins: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr { ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial); - if ( ret!=0 ) + if ( ret!=0 ) { regitr_destroy(itr); return SPLICE_OUTSIDE; // overlaps utr @@ -1912,7 +1966,7 @@ int shifted_del_synonymous(args_t *args, splice_t *splice, uint32_t ex_beg, uint while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++; if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced } - else + else { // STRAND_FWD int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion @@ -2010,7 +2064,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, } } } - if ( splice->ref_end >= ex_beg ) + if ( splice->ref_end >= ex_beg ) { splice->tbeg = splice->ref_beg - splice->vcf.pos + 1; splice->ref_beg = ex_beg - 1; @@ -2060,7 +2114,7 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg, } } } - if ( splice->ref_beg < ex_end ) + if ( splice->ref_beg < ex_end ) { splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); splice->ref_end = ex_end; @@ -2091,8 +2145,8 @@ fprintf(bcftools_stderr,"splice_csq_del: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg, splice->vcf.rlen -= splice->tbeg + splice->tend; splice->vcf.alen -= splice->tbeg + splice->tend; } - splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); - splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt); if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf { splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION; @@ -2139,7 +2193,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d } } } - if ( splice->ref_end >= ex_beg ) + if ( splice->ref_end >= ex_beg ) { splice->tbeg = splice->ref_beg - splice->vcf.pos; splice->ref_beg = ex_beg; @@ -2169,7 +2223,7 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d } } } - if ( splice->ref_beg <= ex_end ) + if ( splice->ref_beg <= ex_end ) { splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1); splice->ref_end = ex_end; @@ -2196,8 +2250,8 @@ fprintf(bcftools_stderr,"mnp: %s>%s .. 
ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d if ( splice->set_refalt ) { splice->vcf.rlen -= splice->tbeg + splice->tend; - splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); - splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); + splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref); + splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt); } csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial); return SPLICE_INSIDE; @@ -2313,7 +2367,7 @@ fprintf(bcftools_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, c } assert( parent->type!=HAP_SSS ); - if ( parent->type==HAP_CDS ) + if ( parent->type==HAP_CDS ) { i = parent->icds; if ( i!=cds->icds ) @@ -2395,7 +2449,7 @@ void hap_destroy(hap_node_t *hap) /* ref: spliced reference and its length (ref.l) - seq: part of the spliced query transcript on the reference strand to translate, its + seq: part of the spliced query transcript on the reference strand to translate, its length (seq.l) and the total length of the complete transcript (seq.m) sbeg: seq offset within the spliced query transcript rbeg: seq offset within ref, 0-based @@ -2503,7 +2557,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r else // STRAND_REV { // right padding - number of bases to take from ref - npad = (seq.m - (sbeg + seq.l)) % 3; + npad = (seq.m - (sbeg + seq.l)) % 3; #if DBG>1 fprintf(bcftools_stderr," npad: %d\n",npad); #endif @@ -2548,12 +2602,12 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r } if ( seq.s-codon==2 ) { - tmp[2] = seq.s[0]; + tmp[2] = seq.s[0]; i = 1; } else if ( seq.s-codon==1 ) { - tmp[1] = seq.s[0]; + tmp[1] = seq.s[0]; tmp[2] = seq.s[1]; i = 0; } @@ -2596,7 +2650,7 @@ fprintf(bcftools_stderr,"\ntranslate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,r void 
tscript_splice_ref(tscript_t *tr) { int i, len = 0; - for (i=0; incds; i++) + for (i=0; incds; i++) len += tr->cds[i]->len; tr->nsref = len + 2*N_REF_PAD; @@ -2634,7 +2688,7 @@ fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); vrec_t *vrec = vbuf->vrec[i]; // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor - if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) ) + if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) ) csq->type.type &= ~CSQ_SPLICE_REGION; if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) @@ -2663,7 +2717,7 @@ fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); if ( csq->type.gene != vrec->vcsq[i].gene ) continue; if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue; if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop - if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) + if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s ) { // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered @@ -2671,14 +2725,14 @@ fprintf(bcftools_stderr,"csq_push: %d .. 
%d\n",rec->pos+1,csq->type.type); // consequences: // stop_lost|AL627309.1|ENST00000423372|protein_coding|- // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA - if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s ) + if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s ) { if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP ) { vrec->vcsq[i].type |= csq->type.type; // remove stop_lost&synonymous if stop_retained set - if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) + if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED ) vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT); if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr; @@ -2688,7 +2742,7 @@ fprintf(bcftools_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type); } if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue; } - vrec->vcsq[i].type |= csq->type.type; + vrec->vcsq[i].type |= csq->type.type; goto exit_duplicate; } } @@ -2698,7 +2752,7 @@ fprintf(bcftools_stderr,"csq_push: %d .. 
%d\n",rec->pos+1,csq->type.type); { if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue; if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue; - if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) ) + if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) ) { vrec->vcsq[i].type |= csq->type.type; goto exit_duplicate; @@ -2801,7 +2855,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, csq->type.biotype = tr->type; // only now we see the translated sequence and can determine if the stop/start changes are real - int rm_csq = 0; + int rm_csq = 0; csq->type.type = 0; for (i=ibeg; i<=iend; i++) csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND; @@ -2828,7 +2882,7 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } if ( csq->type.type & CSQ_STOP_LOST ) { - if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) + if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] ) { rm_csq |= CSQ_STOP_LOST; csq->type.type |= CSQ_STOP_RETAINED; @@ -2864,16 +2918,20 @@ void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, } else { - for (i=0; itref.l; i++) - if ( hap->tref.s[i] != hap->tseq.s[i] ) break; - if ( i==hap->tref.l ) + int aa_change = 0; + for (i=0; itref.l; i++) + { + if ( hap->tref.s[i] == hap->tseq.s[i] ) continue; + aa_change = 1; + if ( hap->tref.s[i] == '*' ) + csq->type.type |= CSQ_STOP_LOST; + else if ( hap->tseq.s[i] == '*' ) + csq->type.type |= CSQ_STOP_GAINED; + else + csq->type.type |= CSQ_MISSENSE_VARIANT; + } + if ( !aa_change ) csq->type.type |= CSQ_SYNONYMOUS_VARIANT; - else if ( hap->tref.s[i] == '*' ) - csq->type.type |= CSQ_STOP_LOST; - else if ( hap->tseq.s[i] == '*' ) - csq->type.type |= CSQ_STOP_GAINED; - else - csq->type.type |= CSQ_MISSENSE_VARIANT; } } // Check if compound inframe variants are real inframes, or if the stop codon 
occurs before the frameshift can be restored @@ -3011,7 +3069,7 @@ void hap_finalize(args_t *args, hap_t *hap) // The spliced sequence has been built for the current haplotype and stored // in hap->sseq. Now we break it and output as independent parts - + kstring_t sseq; sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript hap->upstream_stop = 0; @@ -3269,7 +3327,7 @@ vbuf_t *vbuf_push(args_t *args, bcf1_t **rec_ptr) // check for duplicate records i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1; - if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos ) + if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos ) { // vcf record with a new pos rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf); @@ -3335,7 +3393,7 @@ void vbuf_flush(args_t *args, uint32_t pos) vrec->line->pos = save_pos; // this is necessary for compound variants continue; } - + args->str.l = 0; kput_vcsq(args, &vrec->vcsq[0], &args->str); for (j=1; jnvcsq; j++) @@ -3413,7 +3471,7 @@ static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec) int i = 0; while ( ref[i] && vcf[i] ) { - if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) + if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) ) error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. 
fasta=%c vcf=%c\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]); i++; @@ -3458,7 +3516,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; } if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -3506,7 +3564,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) } if ( csq_type & CSQ_STOP_LOST ) { - if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) + if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] ) { csq_type &= ~CSQ_STOP_LOST; csq_type |= CSQ_STOP_RETAINED; @@ -3539,16 +3597,20 @@ int test_cds_local(args_t *args, bcf1_t *rec) } else { - for (j=0; jl; j++) - if ( tref->s[j] != tseq->s[j] ) break; - if ( j==tref->l ) + int aa_change = 0; + for (j=0; jl; j++) + { + if ( tref->s[j] == tseq->s[j] ) continue; + aa_change = 1; + if ( tref->s[j] == '*' ) + csq_type |= CSQ_STOP_LOST; + else if ( tseq->s[j] == '*' ) + csq_type |= CSQ_STOP_GAINED; + else + csq_type |= CSQ_MISSENSE_VARIANT; + } + if ( !aa_change ) csq_type |= CSQ_SYNONYMOUS_VARIANT; - else if ( tref->s[j] == '*' ) - csq_type |= CSQ_STOP_LOST; - else if ( tseq->s[j] == '*' ) - csq_type |= CSQ_STOP_GAINED; - else - csq_type |= CSQ_MISSENSE_VARIANT; } if ( csq_type & CSQ_COMPOUND ) { @@ -3578,7 +3640,7 @@ int test_cds_local(args_t *args, bcf1_t *rec) tr->root->ncsq_list++; hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list); csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1; - rm_csq->type.vstr = str; + rm_csq->type.vstr = str; } if ( csq_type & ~CSQ_COMPOUND ) { @@ -3646,7 +3708,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); overlaps_warned = 1; } - if ( args->out ) + if ( args->out ) 
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); } else ret = 1; // prevent reporting as intron in test_tscript @@ -3655,7 +3717,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) } if ( child->type==HAP_SSS ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -3682,7 +3744,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) // apply the VCF variants and extend the haplotype tree int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr); ngts /= bcf_hdr_nsamples(args->hdr); - if ( ngts!=1 && ngts!=2 ) + if ( ngts!=1 && ngts!=2 ) { if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) ) { @@ -3693,7 +3755,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n"); multiploid_warned = 1; } - if ( args->out ) + if ( args->out ) fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]); continue; } @@ -3768,7 +3830,7 @@ int test_cds(args_t *args, bcf1_t *rec, vbuf_t *vbuf) } if ( child->type==HAP_SSS ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.biotype = tr->type; @@ -3892,7 +3954,7 @@ int test_utr(args_t *args, bcf1_t *rec) splice.csq = 0; int splice_ret = splice_csq(args, &splice, utr->beg, utr->end); if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = utr->which==prime5 ? 
CSQ_UTR5 : CSQ_UTR3; @@ -3960,7 +4022,7 @@ int test_tscript(args_t *args, bcf1_t *rec) splice.csq = 0; int splice_ret = splice_csq(args, &splice, tr->beg, tr->end); if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); csq.pos = rec->pos; csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING; @@ -3998,7 +4060,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) { while ( regitr_overlap(args->itr) ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*); tscript_t *tr = cds->tr; @@ -4016,7 +4078,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) { while ( regitr_overlap(args->itr) ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*); tscript_t *tr = utr->tr; @@ -4056,7 +4118,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec) while ( regitr_overlap(args->itr) ) { - csq_t csq; + csq_t csq; memset(&csq, 0, sizeof(csq_t)); tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*); splice.vcf.alt = rec->d.allele[1]; @@ -4145,7 +4207,7 @@ static void process(args_t *args, bcf1_t **rec_ptr) return; } - if ( args->rid != rec->rid ) + if ( args->rid != rec->rid ) { hap_flush(args, REGIDX_MAX); vbuf_flush(args, REGIDX_MAX); @@ -4174,7 +4236,7 @@ static void process(args_t *args, bcf1_t **rec_ptr) static const char *usage(void) { - return + return "\n" "About: Haplotype-aware consequence caller.\n" "Usage: bcftools csq [OPTIONS] in.vcf\n" @@ -4184,7 +4246,7 @@ static const char *usage(void) " -g, --gff-annot FILE GFF3 annotation file\n" "\n" "CSQ options:\n" - " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" + " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n" " -c, --custom-tag STRING Use this tag instead of the default BCSQ\n" " -l, 
--local-csq Localized predictions, consider only one VCF record at a time\n" " -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n" @@ -4263,13 +4325,13 @@ int main_csq(int argc, char *argv[]) {"no-version",no_argument,NULL,3}, {0,0,0,0} }; - int c, targets_is_file = 0, regions_is_file = 0; + int c, targets_is_file = 0, regions_is_file = 0; int regions_overlap = 1; int targets_overlap = 0; char *targets_list = NULL, *regions_list = NULL, *tmp; while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0) { - switch (c) + switch (c) { case 1 : args->force = 1; break; case 2 : @@ -4281,19 +4343,19 @@ int main_csq(int argc, char *argv[]) args->brief_predictions = 1; fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n"); break; - case 'B': + case 'B': args->brief_predictions = strtol(optarg,&tmp,10); if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg); break; case 'l': args->local_csq = 1; break; case 'c': args->bcsq_tag = optarg; break; case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break; - case 'v': + case 'v': args->verbosity = atoi(optarg); if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n"); break; case 'p': - switch (optarg[0]) + switch (optarg[0]) { case 'a': args->phase = PHASE_AS_IS; break; case 'm': args->phase = PHASE_MERGE; break; @@ -4305,7 +4367,7 @@ int main_csq(int argc, char *argv[]) break; case 'f': args->fa_fname = optarg; break; case 'g': args->gff_fname = optarg; break; - case 'n': + case 'n': args->ncsq2_max = 2 * atoi(optarg); if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg); break; diff --git a/bcftools/filter.c b/bcftools/filter.c index d545608..3925475 100644 --- a/bcftools/filter.c +++ b/bcftools/filter.c @@ -1,6 +1,6 
@@ /* filter.c -- filter expressions. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -69,6 +69,7 @@ typedef struct _token_t int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) + // -3: select indices on the fly based on values in GT int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only int nidxs, nuidxs; // size of idxs array and the number of elements set to 1 uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise @@ -100,9 +101,17 @@ struct _filter_t float *tmpf; kstring_t tmps; int max_unpack, mtmpi, mtmpf, nsamples; + struct { + bcf1_t *line; + int32_t *buf, nbuf, mbuf; // GTs as obtained by bcf_get_genotypes() + uint64_t *mask; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles + } cached_GT; #if ENABLE_PERL_FILTERS PerlInterpreter *perl; #endif + char **undef_tag; + int nundef_tag; + int status, exit_on_error; }; @@ -298,6 +307,28 @@ static int filters_next_token(char **str, int *len) return TOK_VAL; } +#define FILTER_OK 0 +#define FILTER_ERR_UNKN_TAGS 1 +#define FILTER_ERR_OTHER 2 + +static void filter_add_undef_tag(filter_t *filter, char *str) +{ + int i; + for (i=0; inundef_tag; i++) + if ( !strcmp(str,filter->undef_tag[i]) ) break; + if ( inundef_tag ) return; + filter->nundef_tag++; + filter->undef_tag = (char**)realloc(filter->undef_tag,sizeof(*filter->undef_tag)*filter->nundef_tag); + if ( !filter->undef_tag ) error("Could not allocate memory\n"); + filter->undef_tag[filter->nundef_tag-1] = strdup(str); + if ( !filter->undef_tag[filter->nundef_tag-1] ) error("Could not allocate memory\n"); +} +const char **filter_list_undef_tags(filter_t *filter, int *ntags) +{ + *ntags = filter->nundef_tag; + return (const 
char**)filter->undef_tag; +} + /* Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. @@ -350,6 +381,44 @@ char *expand_path(char *path) return strdup(path); } +static int filters_cache_genotypes(filter_t *flt, bcf1_t *line) +{ + if ( flt->cached_GT.line==line ) return flt->cached_GT.nbuf > 0 ? 0 : -1; + flt->cached_GT.line = line; + flt->cached_GT.nbuf = bcf_get_genotypes(flt->hdr, line, &flt->cached_GT.buf, &flt->cached_GT.mbuf); + if ( flt->cached_GT.nbuf<=0 ) return -1; + if ( !flt->cached_GT.mask ) + { + flt->cached_GT.mask = (uint64_t*) malloc(sizeof(*flt->cached_GT.mask)*flt->nsamples); + if ( !flt->cached_GT.mask ) error("Could not alloc %zu bytes\n",sizeof(*flt->cached_GT.mask)*flt->nsamples); + } + int i,j, ngt1 = flt->cached_GT.nbuf / line->n_sample; + for (i=0; in_sample; i++) + { + int32_t *ptr = flt->cached_GT.buf + i*ngt1; + flt->cached_GT.mask[i] = 0; + for (j=0; j 63 ) + { + static int warned = 0; + if ( !warned ) + { + fprintf(stderr,"Too many alleles, skipping GT filtering at this site %s:%"PRId64". " + "(This warning is printed only once.)\n", bcf_seqname(flt->hdr,line),line->pos+1); + warned = 1; + } + flt->cached_GT.nbuf = 0; + return -1; + } + flt->cached_GT.mask[i] |= 1<values[i] = ptr[tok->idx]; } } + else if ( tok->idx==-3 ) + { + if ( filters_cache_genotypes(flt,line)!=0 ) + { + tok->nvalues = 0; + return; + } + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + int32_t *src = flt->tmpi + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; kcached_GT.mask[i] & (1<nval1; j++) bcf_double_set_vector_end(dst[j]); + } + } else { int kend = tok->idxs[tok->nidxs-1] < 0 ? 
tok->nval1 : tok->nidxs; @@ -825,6 +915,33 @@ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) tok->values[i] = ptr[tok->idx]; } } + else if ( tok->idx==-3 ) + { + if ( filters_cache_genotypes(flt,line)!=0 ) + { + tok->nvalues = 0; + return; + } + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + float *src = flt->tmpf + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; kcached_GT.mask[i] & (1<nval1; j++) bcf_double_set_vector_end(dst[j]); + } + } else { int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs; @@ -989,9 +1106,9 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int tok->str_value.s[tok->str_value.l] = 0; tok->nval1 = nvals1; } -static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } -static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } -static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } +static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } // rr, ra, aa, aA etc +static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } // hap, hom, het +static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } // mis, alt, ref static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -1974,18 +2091,47 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) { \ tok_init_values(atok, btok, rtok); \ tok_init_samples(atok, btok, rtok); \ - if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \ + if ( !atok->nsamples && !btok->nsamples ) \ { \ - assert( 
atok->nsamples==btok->nsamples ); \ - for (i=0; invalues; i++) \ + if ( atok->nvalues!=btok->nvalues && atok->nvalues!=1 && btok->nvalues!=1 ) \ + error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nvalues,btok->nvalues); \ + int ir,ia = 0, ib = 0; \ + for (ir=0; irnvalues; ir++) \ { \ - if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ + if ( atok->nvalues > 1 ) ia = ir; \ + if ( btok->nvalues > 1 ) ib = ir; \ + if ( bcf_double_is_missing_or_vector_end(atok->values[ia]) || bcf_double_is_missing_or_vector_end(btok->values[ib]) ) \ { \ - bcf_double_set_missing(rtok->values[i]); \ + bcf_double_set_missing(rtok->values[ir]); \ continue; \ } \ has_values = 1; \ - rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \ + rtok->values[ir] = TYPE atok->values[ia] AOP TYPE btok->values[ib]; \ + } \ + } \ + else if ( atok->nsamples && btok->nsamples ) \ + { \ + assert( atok->nsamples==btok->nsamples ); \ + if ( atok->nval1!=btok->nval1 && atok->nval1!=1 && btok->nval1!=1 ) \ + error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nval1,btok->nval1); \ + for (i=0; insamples; i++) \ + { \ + double *rval = rtok->values + i*rtok->nval1; \ + double *aval = atok->values + i*atok->nval1; \ + double *bval = btok->values + i*btok->nval1; \ + int ir,ia = 0, ib = 0; \ + for (ir=0; irnval1; ir++) \ + { \ + if ( atok->nval1 > 1 ) ia = ir; \ + if ( btok->nval1 > 1 ) ib = ir; \ + if ( bcf_double_is_missing_or_vector_end(aval[ia]) || bcf_double_is_missing_or_vector_end(bval[ib]) ) \ + { \ + bcf_double_set_missing(rval[ir]); \ + continue; \ + } \ + has_values = 1; \ + rval[ir] = TYPE aval[ia] AOP TYPE bval[ib]; \ + } \ } \ } \ else if ( atok->nsamples ) \ @@ -2451,6 +2597,14 @@ static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx) *idx = -2; return 0; } + if ( !strcmp("GT", tag_idx) ) + { + 
*idxs = (int*) malloc(sizeof(int)); + (*idxs)[0] = -1; + *nidxs = 1; + *idx = -3; + return 0; + } // TAG[integer] .. one field; idx positive char *end, *beg = tag_idx; @@ -2566,7 +2720,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, tok->idxs = (int*) malloc(sizeof(int)); tok->idxs[0] = -1; tok->nidxs = 1; - tok->idx = -2; + tok->idx = idx1; } else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 ) error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag); @@ -2591,7 +2745,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); tok->usmpl[idx1] = 1; } - else if ( idx1==-2 ) + else if ( idx1==-2 || idx1==-3 ) { for (i=0; ihdr, is_fmt, tmp.s, tmp.s+is_array, tok); + if ( tok->idx==-3 && bcf_hdr_id2length(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=BCF_VL_R ) + error("Error: GT subscripts can be used only with Number=R tags\n"); + } else if ( is_fmt && !tok->nsamples ) { int i; @@ -2930,7 +3088,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { errno = 0; tok->threshold = strtod(tmp.s, &end); // float? 
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + if ( errno!=0 || end!=tmp.s+len ) + { + if ( filter->exit_on_error ) + error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + filter->status |= FILTER_ERR_UNKN_TAGS; + filter_add_undef_tag(filter,tmp.s); + } } tok->is_constant = 1; @@ -2938,7 +3102,6 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) return 0; } - static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks) { int i; @@ -3088,12 +3251,13 @@ static void perl_destroy(filter_t *filter) // Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm -filter_t *filter_init(bcf_hdr_t *hdr, const char *str) +static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error) { filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t)); filter->str = strdup(str); filter->hdr = hdr; filter->max_unpack |= BCF_UN_STR; + filter->exit_on_error = exit_on_error; int nops = 0, mops = 0; // operators stack int nout = 0, mout = 0; // filter tokens, RPN @@ -3475,6 +3639,14 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout); return filter; } +filter_t *filter_parse(bcf_hdr_t *hdr, const char *str) +{ + return filter_init_(hdr, str, 0); +} +filter_t *filter_init(bcf_hdr_t *hdr, const char *str) +{ + return filter_init_(hdr, str, 1); +} void filter_destroy(filter_t *filter) { @@ -3496,6 +3668,10 @@ void filter_destroy(filter_t *filter) free(filter->filters[i].regex); } } + for (i=0; inundef_tag; i++) free(filter->undef_tag[i]); + free(filter->undef_tag); + free(filter->cached_GT.buf); + free(filter->cached_GT.mask); free(filter->filters); free(filter->flt_stack); free(filter->str); @@ -3507,6 +3683,7 @@ void filter_destroy(filter_t *filter) int 
filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) { + if ( filter->status != FILTER_OK ) error("Error: the caller did not check the filter status\n"); bcf_unpack(line, filter->max_unpack); int i, nstack = 0; @@ -3669,3 +3846,8 @@ void filter_set_samples(filter_t *filter, const uint8_t *samples) } } +int filter_status(filter_t *filter) +{ + return filter->status; +} + diff --git a/bcftools/filter.c.pysam.c b/bcftools/filter.c.pysam.c index d15586c..8e2d1d1 100644 --- a/bcftools/filter.c.pysam.c +++ b/bcftools/filter.c.pysam.c @@ -2,7 +2,7 @@ /* filter.c -- filter expressions. - Copyright (C) 2013-2022 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. Author: Petr Danecek @@ -71,6 +71,7 @@ typedef struct _token_t int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types int idx; // 0-based index to VCF vectors, // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..]) + // -3: select indices on the fly based on values in GT int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only int nidxs, nuidxs; // size of idxs array and the number of elements set to 1 uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise @@ -102,9 +103,17 @@ struct _filter_t float *tmpf; kstring_t tmps; int max_unpack, mtmpi, mtmpf, nsamples; + struct { + bcf1_t *line; + int32_t *buf, nbuf, mbuf; // GTs as obtained by bcf_get_genotypes() + uint64_t *mask; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles + } cached_GT; #if ENABLE_PERL_FILTERS PerlInterpreter *perl; #endif + char **undef_tag; + int nundef_tag; + int status, exit_on_error; }; @@ -300,6 +309,28 @@ static int filters_next_token(char **str, int *len) return TOK_VAL; } +#define FILTER_OK 0 +#define FILTER_ERR_UNKN_TAGS 1 +#define FILTER_ERR_OTHER 2 + +static void filter_add_undef_tag(filter_t *filter, char *str) +{ + int i; + for 
(i=0; inundef_tag; i++) + if ( !strcmp(str,filter->undef_tag[i]) ) break; + if ( inundef_tag ) return; + filter->nundef_tag++; + filter->undef_tag = (char**)realloc(filter->undef_tag,sizeof(*filter->undef_tag)*filter->nundef_tag); + if ( !filter->undef_tag ) error("Could not allocate memory\n"); + filter->undef_tag[filter->nundef_tag-1] = strdup(str); + if ( !filter->undef_tag[filter->nundef_tag-1] ) error("Could not allocate memory\n"); +} +const char **filter_list_undef_tags(filter_t *filter, int *ntags) +{ + *ntags = filter->nundef_tag; + return (const char**)filter->undef_tag; +} + /* Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller. @@ -352,6 +383,44 @@ char *expand_path(char *path) return strdup(path); } +static int filters_cache_genotypes(filter_t *flt, bcf1_t *line) +{ + if ( flt->cached_GT.line==line ) return flt->cached_GT.nbuf > 0 ? 0 : -1; + flt->cached_GT.line = line; + flt->cached_GT.nbuf = bcf_get_genotypes(flt->hdr, line, &flt->cached_GT.buf, &flt->cached_GT.mbuf); + if ( flt->cached_GT.nbuf<=0 ) return -1; + if ( !flt->cached_GT.mask ) + { + flt->cached_GT.mask = (uint64_t*) malloc(sizeof(*flt->cached_GT.mask)*flt->nsamples); + if ( !flt->cached_GT.mask ) error("Could not alloc %zu bytes\n",sizeof(*flt->cached_GT.mask)*flt->nsamples); + } + int i,j, ngt1 = flt->cached_GT.nbuf / line->n_sample; + for (i=0; in_sample; i++) + { + int32_t *ptr = flt->cached_GT.buf + i*ngt1; + flt->cached_GT.mask[i] = 0; + for (j=0; j 63 ) + { + static int warned = 0; + if ( !warned ) + { + fprintf(bcftools_stderr,"Too many alleles, skipping GT filtering at this site %s:%"PRId64". 
" + "(This warning is printed only once.)\n", bcf_seqname(flt->hdr,line),line->pos+1); + warned = 1; + } + flt->cached_GT.nbuf = 0; + return -1; + } + flt->cached_GT.mask[i] |= 1<values[i] = ptr[tok->idx]; } } + else if ( tok->idx==-3 ) + { + if ( filters_cache_genotypes(flt,line)!=0 ) + { + tok->nvalues = 0; + return; + } + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + int32_t *src = flt->tmpi + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; kcached_GT.mask[i] & (1<nval1; j++) bcf_double_set_vector_end(dst[j]); + } + } else { int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs; @@ -827,6 +917,33 @@ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok) tok->values[i] = ptr[tok->idx]; } } + else if ( tok->idx==-3 ) + { + if ( filters_cache_genotypes(flt,line)!=0 ) + { + tok->nvalues = 0; + return; + } + for (i=0; insamples; i++) + { + if ( !tok->usmpl[i] ) continue; + float *src = flt->tmpf + i*nsrc1; + double *dst = tok->values + i*tok->nval1; + int k, j = 0; + for (k=0; kcached_GT.mask[i] & (1<nval1; j++) bcf_double_set_vector_end(dst[j]); + } + } else { int kend = tok->idxs[tok->nidxs-1] < 0 ? 
tok->nval1 : tok->nidxs; @@ -991,9 +1108,9 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int tok->str_value.s[tok->str_value.l] = 0; tok->nval1 = nvals1; } -static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } -static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } -static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } +static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } // rr, ra, aa, aA etc +static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } // hap, hom, het +static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } // mis, alt, ref static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok) { @@ -1976,18 +2093,47 @@ inline static void tok_init_samples(token_t *atok, token_t *btok, token_t *rtok) { \ tok_init_values(atok, btok, rtok); \ tok_init_samples(atok, btok, rtok); \ - if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \ + if ( !atok->nsamples && !btok->nsamples ) \ { \ - assert( atok->nsamples==btok->nsamples ); \ - for (i=0; invalues; i++) \ + if ( atok->nvalues!=btok->nvalues && atok->nvalues!=1 && btok->nvalues!=1 ) \ + error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nvalues,btok->nvalues); \ + int ir,ia = 0, ib = 0; \ + for (ir=0; irnvalues; ir++) \ { \ - if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \ + if ( atok->nvalues > 1 ) ia = ir; \ + if ( btok->nvalues > 1 ) ib = ir; \ + if ( 
bcf_double_is_missing_or_vector_end(atok->values[ia]) || bcf_double_is_missing_or_vector_end(btok->values[ib]) ) \ { \ - bcf_double_set_missing(rtok->values[i]); \ + bcf_double_set_missing(rtok->values[ir]); \ continue; \ } \ has_values = 1; \ - rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \ + rtok->values[ir] = TYPE atok->values[ia] AOP TYPE btok->values[ib]; \ + } \ + } \ + else if ( atok->nsamples && btok->nsamples ) \ + { \ + assert( atok->nsamples==btok->nsamples ); \ + if ( atok->nval1!=btok->nval1 && atok->nval1!=1 && btok->nval1!=1 ) \ + error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nval1,btok->nval1); \ + for (i=0; insamples; i++) \ + { \ + double *rval = rtok->values + i*rtok->nval1; \ + double *aval = atok->values + i*atok->nval1; \ + double *bval = btok->values + i*btok->nval1; \ + int ir,ia = 0, ib = 0; \ + for (ir=0; irnval1; ir++) \ + { \ + if ( atok->nval1 > 1 ) ia = ir; \ + if ( btok->nval1 > 1 ) ib = ir; \ + if ( bcf_double_is_missing_or_vector_end(aval[ia]) || bcf_double_is_missing_or_vector_end(bval[ib]) ) \ + { \ + bcf_double_set_missing(rval[ir]); \ + continue; \ + } \ + has_values = 1; \ + rval[ir] = TYPE aval[ia] AOP TYPE bval[ib]; \ + } \ } \ } \ else if ( atok->nsamples ) \ @@ -2453,6 +2599,14 @@ static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx) *idx = -2; return 0; } + if ( !strcmp("GT", tag_idx) ) + { + *idxs = (int*) malloc(sizeof(int)); + (*idxs)[0] = -1; + *nidxs = 1; + *idx = -3; + return 0; + } // TAG[integer] .. 
one field; idx positive char *end, *beg = tag_idx; @@ -2568,7 +2722,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, tok->idxs = (int*) malloc(sizeof(int)); tok->idxs[0] = -1; tok->nidxs = 1; - tok->idx = -2; + tok->idx = idx1; } else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 ) error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag); @@ -2593,7 +2747,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx, if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori); tok->usmpl[idx1] = 1; } - else if ( idx1==-2 ) + else if ( idx1==-2 || idx1==-3 ) { for (i=0; ihdr, is_fmt, tmp.s, tmp.s+is_array, tok); + if ( tok->idx==-3 && bcf_hdr_id2length(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=BCF_VL_R ) + error("Error: GT subscripts can be used only with Number=R tags\n"); + } else if ( is_fmt && !tok->nsamples ) { int i; @@ -2932,7 +3090,13 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) { errno = 0; tok->threshold = strtod(tmp.s, &end); // float? - if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + if ( errno!=0 || end!=tmp.s+len ) + { + if ( filter->exit_on_error ) + error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s); + filter->status |= FILTER_ERR_UNKN_TAGS; + filter_add_undef_tag(filter,tmp.s); + } } tok->is_constant = 1; @@ -2940,7 +3104,6 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok) return 0; } - static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks) { int i; @@ -3090,12 +3253,13 @@ static void perl_destroy(filter_t *filter) // Parse filter expression and convert to reverse polish notation. 
Dijkstra's shunting-yard algorithm -filter_t *filter_init(bcf_hdr_t *hdr, const char *str) +static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error) { filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t)); filter->str = strdup(str); filter->hdr = hdr; filter->max_unpack |= BCF_UN_STR; + filter->exit_on_error = exit_on_error; int nops = 0, mops = 0; // operators stack int nout = 0, mout = 0; // filter tokens, RPN @@ -3477,6 +3641,14 @@ filter_t *filter_init(bcf_hdr_t *hdr, const char *str) filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout); return filter; } +filter_t *filter_parse(bcf_hdr_t *hdr, const char *str) +{ + return filter_init_(hdr, str, 0); +} +filter_t *filter_init(bcf_hdr_t *hdr, const char *str) +{ + return filter_init_(hdr, str, 1); +} void filter_destroy(filter_t *filter) { @@ -3498,6 +3670,10 @@ void filter_destroy(filter_t *filter) free(filter->filters[i].regex); } } + for (i=0; inundef_tag; i++) free(filter->undef_tag[i]); + free(filter->undef_tag); + free(filter->cached_GT.buf); + free(filter->cached_GT.mask); free(filter->filters); free(filter->flt_stack); free(filter->str); @@ -3509,6 +3685,7 @@ void filter_destroy(filter_t *filter) int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples) { + if ( filter->status != FILTER_OK ) error("Error: the caller did not check the filter status\n"); bcf_unpack(line, filter->max_unpack); int i, nstack = 0; @@ -3671,3 +3848,8 @@ void filter_set_samples(filter_t *filter, const uint8_t *samples) } } +int filter_status(filter_t *filter) +{ + return filter->status; +} + diff --git a/bcftools/filter.h b/bcftools/filter.h index 243e3b6..7be842a 100644 --- a/bcftools/filter.h +++ b/bcftools/filter.h @@ -1,6 +1,6 @@ /* filter.h -- filter expressions. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2023 Genome Research Ltd. 
Author: Petr Danecek @@ -32,6 +32,8 @@ typedef struct _filter_t filter_t; /** * @hdr: BCF header file * @str: see the bcftools filter command help for description + * + * Same as filter_parse() but exits on errors */ filter_t *filter_init(bcf_hdr_t *hdr, const char *str); @@ -61,4 +63,21 @@ const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1); void filter_expression_info(FILE *fp); int filter_max_unpack(filter_t *filter); +/** + * Same as filter_init() but may not exit on some type of errors. The caller + * must check if the returned value is not NULL and if the consequent call + * of filter_status() returns FILTER_OK before the filter_pass() can be called. + */ +filter_t *filter_parse(bcf_hdr_t *hdr, const char *str); + +#define FILTER_OK 0 +#define FILTER_ERR_UNKN_TAGS 1 +#define FILTER_ERR_OTHER 2 + +/** + * Check if filter_parse() was successful + */ +int filter_status(filter_t *filter); +const char **filter_list_undef_tags(filter_t *filter, int *nundef); + #endif diff --git a/bcftools/main.c b/bcftools/main.c index 3a0d557..a021358 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -265,7 +265,7 @@ int main(int argc, char *argv[]) if (argc < 2) { usage(stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) { - printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2022 Genome Research Ltd.\n", bcftools_version(), hts_version()); + printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2023 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL printf("License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/main.c.pysam.c b/bcftools/main.c.pysam.c index 535813c..7608adc 100644 --- a/bcftools/main.c.pysam.c +++ b/bcftools/main.c.pysam.c @@ -267,7 +267,7 @@ int bcftools_main(int argc, char *argv[]) if (argc < 2) { usage(bcftools_stderr); return 1; } if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || 
strcmp(argv[1], "-v") == 0) { - fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2022 Genome Research Ltd.\n", bcftools_version(), hts_version()); + fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2023 Genome Research Ltd.\n", bcftools_version(), hts_version()); #if USE_GPL fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later \n"); #else diff --git a/bcftools/mcall.c b/bcftools/mcall.c index 5761896..804ff01 100644 --- a/bcftools/mcall.c +++ b/bcftools/mcall.c @@ -1,6 +1,6 @@ /* mcall.c -- multiallelic and rare variant calling. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Author: Petr Danecek @@ -314,7 +314,7 @@ static void init_sample_groups(call_t *call) while ( *ptr && isspace(*ptr) ) ptr++; if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]); *tmp = 0; - int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); + int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); if ( ismpl<0 ) continue; if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); if ( !khash_str2int_has_key(grp2idx,ptr+1) ) @@ -336,7 +336,7 @@ static void init_sample_groups(call_t *call) { if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups); int igrp = smpl2grp[i] - 1; - if ( !call->smpl_grp[igrp].nsmpl ) + if ( !call->smpl_grp[igrp].nsmpl ) call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t)); call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i; call->smpl_grp[igrp].nsmpl++; @@ -745,7 +745,7 @@ static void mcall_set_ref_genotypes(call_t *call, int nals_ori) static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp) { int ia, ib, i; - int ngts_ori = nals_ori*(nals_ori+1)/2; + int ngts_ori = 
nals_ori*(nals_ori+1)/2; int ngts_new = call->nals_new*(call->nals_new+1)/2; int nsmpl = grp->nsmpl; @@ -1271,8 +1271,9 @@ void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) { assert( call->tgt_als->n ); - if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n); hts_expand(char*,call->tgt_als->n+1,call->nals,call->als); + hts_expand(int,call->tgt_als->n+1,call->nals_map,call->als_map); + hts_expand(int,(call->tgt_als->n+1)*(call->tgt_als->n+2)/2,call->npl_map,call->pl_map); int has_new = 0; @@ -1290,18 +1291,6 @@ static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) { call->als[nals] = call->tgt_als->allele[i]; j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); - - // if ( j+1==*unseen ) - // { - // fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. 
VCF=",i,call->tgt_als->allele[i],j,*unseen); - // int k; - // for (k=0; kn_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]); - // fprintf(stderr,"\tTAB="); - // for (k=0; ktgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); - // fprintf(stderr,"\n"); - // return -1; - // } - if ( j>=0 ) { // existing allele @@ -1537,9 +1526,9 @@ int mcall(call_t *call, bcf1_t *rec) bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag if ( nals_ori > 8*sizeof(call->als_new) ) - { + { fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; + return 0; } // For each group find the best combination of alleles @@ -1596,9 +1585,9 @@ int mcall(call_t *call, bcf1_t *rec) for (i=0; inals_new; i++) call->ac[i] = 0; if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 ) - { + { fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; + return 0; } if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) { @@ -1670,7 +1659,7 @@ int mcall(call_t *call, bcf1_t *rec) anno16_t a; float tmpf[4]; int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0; - if ( is_tested ) + if ( is_tested ) { for (i=0; i<4; i++) tmpf[i] = a.p[i]; bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4); diff --git a/bcftools/mcall.c.pysam.c b/bcftools/mcall.c.pysam.c index ea57344..bf3806f 100644 --- a/bcftools/mcall.c.pysam.c +++ b/bcftools/mcall.c.pysam.c @@ -2,7 +2,7 @@ /* mcall.c -- multiallelic and rare variant calling. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. 
Author: Petr Danecek @@ -316,7 +316,7 @@ static void init_sample_groups(call_t *call) while ( *ptr && isspace(*ptr) ) ptr++; if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]); *tmp = 0; - int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); + int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]); if ( ismpl<0 ) continue; if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups); if ( !khash_str2int_has_key(grp2idx,ptr+1) ) @@ -338,7 +338,7 @@ static void init_sample_groups(call_t *call) { if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups); int igrp = smpl2grp[i] - 1; - if ( !call->smpl_grp[igrp].nsmpl ) + if ( !call->smpl_grp[igrp].nsmpl ) call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t)); call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i; call->smpl_grp[igrp].nsmpl++; @@ -747,7 +747,7 @@ static void mcall_set_ref_genotypes(call_t *call, int nals_ori) static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp) { int ia, ib, i; - int ngts_ori = nals_ori*(nals_ori+1)/2; + int ngts_ori = nals_ori*(nals_ori+1)/2; int ngts_new = call->nals_new*(call->nals_new+1)/2; int nsmpl = grp->nsmpl; @@ -1273,8 +1273,9 @@ void mcall_trim_and_update_numberR(call_t *call, bcf1_t *rec, int nals_ori, int static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) { assert( call->tgt_als->n ); - if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n); hts_expand(char*,call->tgt_als->n+1,call->nals,call->als); + hts_expand(int,call->tgt_als->n+1,call->nals_map,call->als_map); + hts_expand(int,(call->tgt_als->n+1)*(call->tgt_als->n+2)/2,call->npl_map,call->pl_map); int has_new = 0; @@ -1292,18 +1293,6 @@ static int 
mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen) { call->als[nals] = call->tgt_als->allele[i]; j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]); - - // if ( j+1==*unseen ) - // { - // fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. VCF=",i,call->tgt_als->allele[i],j,*unseen); - // int k; - // for (k=0; kn_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]); - // fprintf(bcftools_stderr,"\tTAB="); - // for (k=0; ktgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]); - // fprintf(bcftools_stderr,"\n"); - // return -1; - // } - if ( j>=0 ) { // existing allele @@ -1539,9 +1528,9 @@ int mcall(call_t *call, bcf1_t *rec) bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag if ( nals_ori > 8*sizeof(call->als_new) ) - { + { fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; + return 0; } // For each group find the best combination of alleles @@ -1598,9 +1587,9 @@ int mcall(call_t *call, bcf1_t *rec) for (i=0; inals_new; i++) call->ac[i] = 0; if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 ) - { + { fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1); - return 0; + return 0; } if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) ) { @@ -1672,7 +1661,7 @@ int mcall(call_t *call, bcf1_t *rec) anno16_t a; float tmpf[4]; int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 
1 : 0; - if ( is_tested ) + if ( is_tested ) { for (i=0; i<4; i++) tmpf[i] = a.p[i]; bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4); diff --git a/bcftools/mpileup.c b/bcftools/mpileup.c index fc4f4b1..9b21b18 100644 --- a/bcftools/mpileup.c +++ b/bcftools/mpileup.c @@ -68,7 +68,8 @@ typedef struct _mplp_pileup_t mplp_pileup_t; // Data shared by all bam files typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, - max_indel_depth, max_read_len, fmt_flag, ambig_reads; + max_indel_depth, max_read_len, ambig_reads; + uint32_t fmt_flag; int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels @@ -97,6 +98,7 @@ typedef struct { bcf1_t *bcf_rec; htsFile *bcf_fp; bcf_hdr_t *bcf_hdr; + int indels_v20; int argc; char **argv; } mplp_conf_t; @@ -294,24 +296,23 @@ static int mplp_func(void *data, bam1_t *b) // We cache sample information here so we don't have to keep recomputing this // on each and every pileup column. If FMT/SCR annotation is requested, a flag // is set to indicate the presence of a soft clip. -// -// Cd is an arbitrary block of data we can write into, which ends up in -// the pileup structures. We stash the sample ID there: -// has_soft_clip .. cd->i & 1 -// sample_id .. cd->i >> 1 static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + cd->p = calloc(1,sizeof(plp_cd_t)); + + PLP_NM(cd) = PLP_NM_UNSET; + mplp_aux_t *ma = (mplp_aux_t *)data; int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); - cd->i = 0; - PLP_SET_SAMPLE_ID(cd->i, n); + PLP_SET_SAMPLE_ID(cd, n); + // Whether read has a soft-clip is used in mplp_realn's heuristics. // TODO: consider whether clip length is beneficial to use? 
int i; for (i=0; icore.n_cigar; i++) { int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; if (cig == BAM_CSOFT_CLIP) { - PLP_SET_SOFT_CLIP(cd->i); + PLP_SET_SOFT_CLIP(cd); break; } } @@ -330,7 +331,7 @@ static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) // Possible further optimsation, check tot_ins==1 later // (and remove break) so we can detect single bp indels. // We may want to focus BAQ on more complex regions only. - PLP_SET_INDEL(cd->i); + PLP_SET_INDEL(cd); break; } @@ -345,6 +346,11 @@ static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) return 0; } +static int pileup_destructor(void *data, const bam1_t *b, bam_pileup_cd *cd) +{ + free(cd->p); + return 0; +} static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp) { @@ -355,7 +361,7 @@ static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, c for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position { const bam_pileup1_t *p = plp[i] + j; - int id = PLP_SAMPLE_ID(p->cd.i); + int id = PLP_SAMPLE_ID(&(p->cd)); if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; @@ -418,11 +424,11 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, nt += n_plp[i]; for (j = 0; j < n_plp[i]; j++) { // iterate over reads bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j; - has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0; + has_indel += (PLP_HAS_INDEL(&p->cd) || p->indel) ? 1 : 0; // Has_clip is almost always true for very long reads // (eg PacBio CCS), but these rarely matter as the clip // is likely a long way from this indel. - has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0; + has_clip += (PLP_HAS_SOFT_CLIP(&p->cd)) ? 
1 : 0; if (max_indel < p->indel) max_indel = p->indel; if (min_indel > p->indel) @@ -453,9 +459,8 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, // We could use our own structure (p->cd.p), allocated during // the constructor, but for simplicity we play dirty and // abuse an unused flag bit instead. - if (b->core.flag & 32768) - continue; - b->core.flag |= 32768; + if ( PLP_IS_REALN(&(p->cd)) ) continue; + PLP_SET_REALN(&(p->cd)); if (b->core.l_qseq > max_read_len) continue; @@ -553,8 +558,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) } int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); if (has_ref && (conf->flag & MPLP_REALN)) - mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, - conf->max_read_len, ref, ref_len, pos); + mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, conf->max_read_len, ref, ref_len, pos); int total_depth, _ref0, ref16; for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i]; @@ -567,23 +571,30 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) conf->bc.tid = tid; conf->bc.pos = pos; bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc); bcf_clear1(conf->bcf_rec); - bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, - conf->bca, 0); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, 0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? 
// check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth - && (bcf_callaux_clean(conf->bca, &conf->bc), - bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)) + if ( !(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth ) { - for (i = 0; i < conf->gplp->n; ++i) - bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); - if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + bcf_callaux_clean(conf->bca, &conf->bc); + conf->bca->chr = tid>=0 ? hdr->target_name[tid] : NULL; + int iret; + if ( conf->indels_v20 ) + iret = bcf_iaux_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref); + else + iret = bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref); + if ( iret>=0 ) { - bcf_clear1(conf->bcf_rec); - bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); - flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + for (i = 0; i < conf->gplp->n; ++i) + bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); + if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + { + bcf_clear1(conf->bcf_rec); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + } } } } @@ -765,40 +776,38 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##ALT="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_IDV ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_IMF ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); 
if ( conf->fmt_flag&B2B_INFO_VDB ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if (conf->fmt_flag & B2B_INFO_ZSCORE) { - if ( conf->fmt_flag&B2B_INFO_RPB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_RPBZ ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MQBZ ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_BQBZ ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MQSBZ ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_FMT_NMBZ ) - bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_SCB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - } else { - if ( conf->fmt_flag&B2B_INFO_RPB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - } - - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -#if CDF_MWU_TESTS - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -#endif - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MIN_PL_SUM ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_NM ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_NMBZ ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_FMT_NMBZ ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_INFO_SCBZ ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_FS ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_SGB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MQ0F ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); 
bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); @@ -852,6 +861,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->fmt_flag = conf->fmt_flag; conf->bca->ambig_reads = conf->ambig_reads; conf->bca->indel_win_size = conf->indel_win_size; + conf->bca->indels_v20 = conf->indels_v20; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -902,6 +912,8 @@ static int mpileup(mplp_conf_t *conf) conf->max_indel_depth = conf->max_indel_depth * nsmpl; conf->bcf_rec = bcf_init1(); bam_mplp_constructor(conf->iter, pileup_constructor); + bam_mplp_destructor(conf->iter, pileup_destructor); + // Run mpileup for multiple regions if ( nregs ) @@ -1045,38 +1057,68 @@ int read_file_list(const char *file_list,int *n,char **argv[]) } #undef MAX_PATH_LEN -int parse_format_flag(const char *str) +#define SET_FMT_FLAG(str,bit,msg) \ + if (!strcasecmp(tag,str) || !strcasecmp(tag,"FMT/"str) || !strcasecmp(tag,"FORMAT/"str)) \ + { \ + if ( *msg ) fprintf(stderr,"%s",msg); \ + if ( exclude ) \ + *flag &= ~bit; \ + else \ + *flag |= bit; \ + free(tags[i]); \ + continue; \ + } +#define SET_INFO_FLAG(str,bit,msg) if (!strcasecmp(tag,"INFO/"str)) \ + { \ + if ( exclude ) \ + *flag &= ~bit; \ + else \ + *flag |= bit; \ + free(tags[i]); \ + continue; \ + } + +void parse_format_flag(uint32_t *flag, const char *str) { - int i, flag = 0, n_tags; + int i, n_tags; char **tags = hts_readlist(str, 0, &n_tags); for(i=0; iindel_bias); fprintf(fp, " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); + fprintf(fp, + " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n"); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" @@ -1240,7 +1300,7 @@ int main_mpileup(int argc, char *argv[]) mplp.n_threads = 0; mplp.bsmpl = bam_smpl_init(); // the default to be changed in future, see also parse_format_flag() - mplp.fmt_flag = 
B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; + mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; mplp.indel_win_size = 110; @@ -1302,6 +1362,7 @@ int main_mpileup(int argc, char *argv[]) {"gap-frac", required_argument, NULL, 'F'}, {"indel-bias", required_argument, NULL, 10}, {"indel-size", required_argument, NULL, 15}, + {"indels-2.0", no_argument, NULL, 20}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1311,7 +1372,6 @@ int main_mpileup(int argc, char *argv[]) {"platforms", required_argument, NULL, 'P'}, {"max-read-len", required_argument, NULL, 'M'}, {"config", required_argument, NULL, 'X'}, - {"mwu-u", no_argument, NULL, 'U'}, {"seed", required_argument, NULL, 13}, {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, @@ -1436,6 +1496,7 @@ int main_mpileup(int argc, char *argv[]) } } break; + case 20: mplp.indels_v20 = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1446,10 +1507,9 @@ int main_mpileup(int argc, char *argv[]) list_annotations(stderr); return 1; } - mplp.fmt_flag |= parse_format_flag(optarg); + parse_format_flag(&mplp.fmt_flag,optarg); break; case 'M': mplp.max_read_len = atoi(optarg); break; - case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break; case 'X': if (strcasecmp(optarg, "pacbio-ccs") == 0) { mplp.min_frac = 0.1; diff --git a/bcftools/mpileup.c.pysam.c b/bcftools/mpileup.c.pysam.c index 6e0ae5b..724a0ec 100644 --- a/bcftools/mpileup.c.pysam.c +++ b/bcftools/mpileup.c.pysam.c @@ -70,7 +70,8 @@ typedef struct _mplp_pileup_t mplp_pileup_t; // Data shared by all bam files typedef struct { int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth, - 
max_indel_depth, max_read_len, fmt_flag, ambig_reads; + max_indel_depth, max_read_len, ambig_reads; + uint32_t fmt_flag; int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type; int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels double min_frac; // for indels @@ -99,6 +100,7 @@ typedef struct { bcf1_t *bcf_rec; htsFile *bcf_fp; bcf_hdr_t *bcf_hdr; + int indels_v20; int argc; char **argv; } mplp_conf_t; @@ -296,24 +298,23 @@ static int mplp_func(void *data, bam1_t *b) // We cache sample information here so we don't have to keep recomputing this // on each and every pileup column. If FMT/SCR annotation is requested, a flag // is set to indicate the presence of a soft clip. -// -// Cd is an arbitrary block of data we can write into, which ends up in -// the pileup structures. We stash the sample ID there: -// has_soft_clip .. cd->i & 1 -// sample_id .. cd->i >> 1 static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + cd->p = calloc(1,sizeof(plp_cd_t)); + + PLP_NM(cd) = PLP_NM_UNSET; + mplp_aux_t *ma = (mplp_aux_t *)data; int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b); - cd->i = 0; - PLP_SET_SAMPLE_ID(cd->i, n); + PLP_SET_SAMPLE_ID(cd, n); + // Whether read has a soft-clip is used in mplp_realn's heuristics. // TODO: consider whether clip length is beneficial to use? int i; for (i=0; icore.n_cigar; i++) { int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK; if (cig == BAM_CSOFT_CLIP) { - PLP_SET_SOFT_CLIP(cd->i); + PLP_SET_SOFT_CLIP(cd); break; } } @@ -332,7 +333,7 @@ static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) // Possible further optimsation, check tot_ins==1 later // (and remove break) so we can detect single bp indels. // We may want to focus BAQ on more complex regions only. 
- PLP_SET_INDEL(cd->i); + PLP_SET_INDEL(cd); break; } @@ -347,6 +348,11 @@ static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) return 0; } +static int pileup_destructor(void *data, const bam1_t *b, bam_pileup_cd *cd) +{ + free(cd->p); + return 0; +} static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp) { @@ -357,7 +363,7 @@ static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, c for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position { const bam_pileup1_t *p = plp[i] + j; - int id = PLP_SAMPLE_ID(p->cd.i); + int id = PLP_SAMPLE_ID(&(p->cd)); if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; @@ -420,11 +426,11 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, nt += n_plp[i]; for (j = 0; j < n_plp[i]; j++) { // iterate over reads bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j; - has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0; + has_indel += (PLP_HAS_INDEL(&p->cd) || p->indel) ? 1 : 0; // Has_clip is almost always true for very long reads // (eg PacBio CCS), but these rarely matter as the clip // is likely a long way from this indel. - has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0; + has_clip += (PLP_HAS_SOFT_CLIP(&p->cd)) ? 1 : 0; if (max_indel < p->indel) max_indel = p->indel; if (min_indel > p->indel) @@ -455,9 +461,8 @@ static void mplp_realn(int n, int *n_plp, const bam_pileup1_t **plp, // We could use our own structure (p->cd.p), allocated during // the constructor, but for simplicity we play dirty and // abuse an unused flag bit instead. 
- if (b->core.flag & 32768) - continue; - b->core.flag |= 32768; + if ( PLP_IS_REALN(&(p->cd)) ) continue; + PLP_SET_REALN(&(p->cd)); if (b->core.l_qseq > max_read_len) continue; @@ -555,8 +560,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) } int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len); if (has_ref && (conf->flag & MPLP_REALN)) - mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, - conf->max_read_len, ref, ref_len, pos); + mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, conf->max_read_len, ref, ref_len, pos); int total_depth, _ref0, ref16; for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i]; @@ -569,23 +573,30 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) conf->bc.tid = tid; conf->bc.pos = pos; bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc); bcf_clear1(conf->bcf_rec); - bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, - conf->bca, 0); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, 0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them - if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth - && (bcf_callaux_clean(conf->bca, &conf->bc), - bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)) + if ( !(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth ) { - for (i = 0; i < conf->gplp->n; ++i) - bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); - if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + bcf_callaux_clean(conf->bca, &conf->bc); + conf->bca->chr = tid>=0 ? 
hdr->target_name[tid] : NULL; + int iret; + if ( conf->indels_v20 ) + iret = bcf_iaux_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref); + else + iret = bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref); + if ( iret>=0 ) { - bcf_clear1(conf->bcf_rec); - bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); - flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + for (i = 0; i < conf->gplp->n; ++i) + bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i); + if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0) + { + bcf_clear1(conf->bcf_rec); + bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref); + flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec); + } } } } @@ -767,40 +778,38 @@ static int mpileup(mplp_conf_t *conf) bcf_hdr_append(conf->bcf_hdr,"##ALT="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_IDV ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_IMF ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); if ( conf->fmt_flag&B2B_INFO_VDB ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if (conf->fmt_flag & B2B_INFO_ZSCORE) { - if ( conf->fmt_flag&B2B_INFO_RPB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_RPBZ ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MQBZ ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_BQBZ ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MQSBZ ) bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - if ( conf->fmt_flag&B2B_FMT_NMBZ ) - bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); - if ( conf->fmt_flag&B2B_INFO_SCB ) - 
bcf_hdr_append(conf->bcf_hdr,"##INFO="); - } else { - if ( conf->fmt_flag&B2B_INFO_RPB ) - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - } - - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -#if CDF_MWU_TESTS - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); -#endif - bcf_hdr_append(conf->bcf_hdr,"##INFO="); - bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MIN_PL_SUM ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_NM ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_NMBZ ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_FMT_NMBZ ) + bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); + if ( conf->fmt_flag&B2B_INFO_SCBZ ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_FS ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_SGB ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); + if ( conf->fmt_flag&B2B_INFO_MQ0F ) + bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##INFO="); bcf_hdr_append(conf->bcf_hdr,"##FORMAT="); @@ -854,6 +863,7 @@ static int mpileup(mplp_conf_t *conf) conf->bca->fmt_flag = conf->fmt_flag; conf->bca->ambig_reads = conf->ambig_reads; conf->bca->indel_win_size = conf->indel_win_size; + conf->bca->indels_v20 = conf->indels_v20; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; @@ -904,6 +914,8 @@ static int mpileup(mplp_conf_t *conf) conf->max_indel_depth = conf->max_indel_depth * nsmpl; conf->bcf_rec = bcf_init1(); bam_mplp_constructor(conf->iter, pileup_constructor); + bam_mplp_destructor(conf->iter, pileup_destructor); + // Run mpileup for multiple regions if ( nregs ) @@ -1047,38 +1059,68 @@ int 
read_file_list(const char *file_list,int *n,char **argv[]) } #undef MAX_PATH_LEN -int parse_format_flag(const char *str) +#define SET_FMT_FLAG(str,bit,msg) \ + if (!strcasecmp(tag,str) || !strcasecmp(tag,"FMT/"str) || !strcasecmp(tag,"FORMAT/"str)) \ + { \ + if ( *msg ) fprintf(bcftools_stderr,"%s",msg); \ + if ( exclude ) \ + *flag &= ~bit; \ + else \ + *flag |= bit; \ + free(tags[i]); \ + continue; \ + } +#define SET_INFO_FLAG(str,bit,msg) if (!strcasecmp(tag,"INFO/"str)) \ + { \ + if ( exclude ) \ + *flag &= ~bit; \ + else \ + *flag |= bit; \ + free(tags[i]); \ + continue; \ + } + +void parse_format_flag(uint32_t *flag, const char *str) { - int i, flag = 0, n_tags; + int i, n_tags; char **tags = hts_readlist(str, 0, &n_tags); for(i=0; iindel_bias); fprintf(fp, " --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size); + fprintf(fp, + " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n"); fprintf(fp,"\n"); fprintf(fp, "Configuration profiles activated with -X, --config:\n" @@ -1242,7 +1302,7 @@ int main_mpileup(int argc, char *argv[]) mplp.n_threads = 0; mplp.bsmpl = bam_smpl_init(); // the default to be changed in future, see also parse_format_flag() - mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; + mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; mplp.indel_win_size = 110; @@ -1304,6 +1364,7 @@ int main_mpileup(int argc, char *argv[]) {"gap-frac", required_argument, NULL, 'F'}, {"indel-bias", required_argument, NULL, 10}, {"indel-size", required_argument, NULL, 15}, + {"indels-2.0", no_argument, NULL, 20}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, @@ -1313,7 +1374,6 @@ int main_mpileup(int argc, char *argv[]) {"platforms", 
required_argument, NULL, 'P'}, {"max-read-len", required_argument, NULL, 'M'}, {"config", required_argument, NULL, 'X'}, - {"mwu-u", no_argument, NULL, 'U'}, {"seed", required_argument, NULL, 13}, {"ambig-reads", required_argument, NULL, 14}, {"ar", required_argument, NULL, 14}, @@ -1438,6 +1498,7 @@ int main_mpileup(int argc, char *argv[]) } } break; + case 20: mplp.indels_v20 = 1; break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; @@ -1448,10 +1509,9 @@ int main_mpileup(int argc, char *argv[]) list_annotations(bcftools_stderr); return 1; } - mplp.fmt_flag |= parse_format_flag(optarg); + parse_format_flag(&mplp.fmt_flag,optarg); break; case 'M': mplp.max_read_len = atoi(optarg); break; - case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break; case 'X': if (strcasecmp(optarg, "pacbio-ccs") == 0) { mplp.min_frac = 0.1; diff --git a/bcftools/read_consensus.c b/bcftools/read_consensus.c new file mode 100644 index 0000000..5c8133f --- /dev/null +++ b/bcftools/read_consensus.c @@ -0,0 +1,804 @@ +/* read_consensus.c -- create and maintain consensus of reads + + Copyright (C) 2022 Genome Research Ltd. + + Author: pd3@sanger + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include "bcftools.h" +#include "read_consensus.h" +#include "cigar_state.h" +#include "kheap.h" + + +// Frequency arrays for each variant type +#define NI 10 // number of alternative insertion sequences at one position in a single sample +typedef struct +{ + char *nt16_seq[NI]; + int len[NI]; + int freq[NI]; +} +ins_freq_t; + +typedef struct +{ + int len[NI]; + int freq[NI]; +} +del_freq_t; + +#define BF_DEL 5 +typedef struct +{ + int base[6]; // frequencies of A,C,G,T,N,deletion +} +base_freq_t; + + +// Candidate variants for each interesting position to build consensus haplotypes +enum variant_type { snv, ins, del, done }; +typedef struct +{ + enum variant_type vtype; + hts_pos_t pos; // variant position (reference sequence coordinates), indels follow VCF convention + int idx; // temporary 0-based index to rcns.cvar + int which, // base/ins/del in rcns.[base|ins|del]_freq array + depth; // coverage at the position + float af, af_dev; // variant allele frequency (just for debugging printout) and absolute af deviation from 0.5 +} +candidate_var_t; +static inline int cvar_not_preferred(candidate_var_t *a, candidate_var_t *b) +{ + if ( a->af_dev == b->af_dev ) return a->depth < b->depth ? 1 : 0; + return a->af_dev > b->af_dev ? 
1 : 0; +} +KHEAP_INIT(cvh, candidate_var_t, cvar_not_preferred); +typedef khp_cvh_t cvar_heap_t; + +#define MAX_NCVAR 8 // This results in alloc() of 2^MAX_NCVAR possible haplotypes +#define NHAP (1<mfreq; i++) + { + ins_freq_t *ifrq = &rcns->ins_freq[i]; + for (j=0; jnt16_seq[j]; j++) free(ifrq->nt16_seq[j]); + } + for (i=0; i<2; i++) + free(rcns->cns[i].seq); + free(rcns->ins_freq); + free(rcns->del_freq); + free(rcns->base_freq); + free(rcns->stmp); + khp_destroy(cvh,rcns->cv_heap); + free(rcns); +} +static int init_arrays(read_cns_t *rcns) +{ + int i,j,n = rcns->end - rcns->beg + 1; + if ( n > rcns->mfreq ) + { + ins_freq_t *ifrq = (ins_freq_t*) realloc(rcns->ins_freq,sizeof(*rcns->ins_freq)*n); + if ( !ifrq ) return -1; + rcns->ins_freq = ifrq; + memset(ifrq+rcns->mfreq,0,sizeof(*rcns->ins_freq)*(n-rcns->mfreq)); + + del_freq_t *dfrq = (del_freq_t*) realloc(rcns->del_freq,sizeof(*rcns->del_freq)*n); + if ( !dfrq ) return -1; + rcns->del_freq = dfrq; + memset(dfrq+rcns->mfreq,0,sizeof(*rcns->del_freq)*(n-rcns->mfreq)); + + base_freq_t *bfrq = (base_freq_t*) realloc(rcns->base_freq,sizeof(*rcns->base_freq)*n); + if ( !bfrq ) return -1; + rcns->base_freq = bfrq; + memset(bfrq+rcns->mfreq,0,sizeof(*rcns->base_freq)*(n-rcns->mfreq)); + + rcns->mfreq = n; + } + memset(rcns->base_freq,0,sizeof(*rcns->base_freq)*n); + memset(rcns->del_freq,0,sizeof(*rcns->del_freq)*n); + for (i=0; iins_freq[i]; + for (j=0; jnt16_seq[j]; j++) free(ifrq->nt16_seq[j]); + } + memset(rcns->ins_freq,0,sizeof(*rcns->ins_freq)*n); + return 0; +} +int rcns_reset(read_cns_t *rcns, hts_pos_t pos, hts_pos_t beg, hts_pos_t end) +{ + rcns->band = 0; + rcns->pos = pos; + rcns->beg = beg; + rcns->end = end; + int i; + for (i=0; i<2; i++) rcns->cns[i].nseq = rcns->cns[i].ipos = 0; + // this should not be necessary if the caller did run all steps + while (rcns->cv_heap->ndat) khp_delete(cvh, rcns->cv_heap); + return init_arrays(rcns); +} + +static inline void add_base(read_cns_t *rcns, int ref_pos, int 
nt16) +{ + int i = ref_pos - rcns->beg; + rcns->base_freq[i].base[seq_nt16_int[nt16]]++; +} +static void add_ins(read_cns_t *rcns, int ref_pos, int seq_pos, uint8_t *raw_seq, int len) +{ + int i = ref_pos - rcns->beg; + ins_freq_t *ifrq = &rcns->ins_freq[i]; + char *str; + if ( rcns->mstmp < len ) + { + str = realloc(rcns->stmp,len*sizeof(*str)); + if ( !str ) return; + rcns->mstmp = len; + rcns->stmp = str; + } + else + str = rcns->stmp; + for (i=0; int16_seq[i]; i++) + if ( ifrq->len[i]==len && !memcmp(ifrq->nt16_seq[i],str,len) ) break; + + if ( i>=NI ) return; // too many choices, typically homopolymers in long reads; discard + + if ( !ifrq->nt16_seq[i] ) // new insertion + { + if ( !(ifrq->nt16_seq[i]=malloc(len)) ) return; + memcpy(ifrq->nt16_seq[i], str, len); + ifrq->len[i] = len; + } + ifrq->freq[i]++; +} +static void add_del(read_cns_t *rcns, int ref_pos, int len) +{ + int i = ref_pos - rcns->beg; + int j,n = rcns->end - rcns->beg + 1; + if ( i + len + 1 < n ) n = i + len + 1; + for (j=i+1; jbase_freq[j].base[BF_DEL]++; + + del_freq_t *dfrq = &rcns->del_freq[i]; + for (i=0; ilen[i]; i++) + if ( dfrq->len[i]==len ) break; + + if ( i>=NI ) return; // too many choices, typically homopolymers in long reads; discard + + if ( !dfrq->len[i] ) dfrq->len[i] = len; // new deletion + dfrq->freq[i]++; +} + +read_cns_t *rcns_init(hts_pos_t pos, hts_pos_t beg, hts_pos_t end) +{ + read_cns_t *rcns = (read_cns_t*) calloc(1,sizeof(read_cns_t)); + rcns->pos = pos; + rcns->beg = beg; + rcns->end = end; + rcns->cv_heap = khp_init(cvh); + if ( init_arrays(rcns)!=0 ) + { + rcns_destroy(rcns); + return NULL; + } + return rcns; +} + +int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp) +{ + // save the reads for phasing, this can be called multiple times + rcns->plp = plp; + rcns->nplp = nplp; + + // fill consensus arrays + int i,j,k, local_band_max = 0; // maximum absolute deviation from diagonal + for (i=0; ib; + int x = b->core.pos; // ref coordinate + int y = 
0; // seq coordinate + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + int local_band = 0; // current deviation from diagonal + for (k = 0; k < b->core.n_cigar; ++k) + { + int op = cigar[k] & BAM_CIGAR_MASK; + int len = cigar[k] >> BAM_CIGAR_SHIFT; + if ( op==BAM_CSOFT_CLIP ) y += len; + else if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF ) + { + if ( xend && x+len>rcns->beg ) + { + int j_beg = rcns->beg > x ? rcns->beg - x : 0; // how many bases to skip in the ref and qry + int j_end = rcns->end < x + len - 1 ? rcns->end - x : len - 1; + x += j_beg; + y += j_beg; + for (j=j_beg; j<=j_end; j++, x++, y++) add_base(rcns,x,bam_seqi(seq,y)); + } + else + { + x += len; + y += len; + } + } + else if ( op==BAM_CINS ) + { + if ( x>rcns->beg && xend ) + { + local_band += p->indel; + add_ins(rcns,x-1,y,seq,len); // x-1: one base before as in VCF + } + y += len; + } + else if ( op==BAM_CDEL ) + { + if ( x>rcns->beg && x+len-1<=rcns->end ) + { + local_band += -p->indel; + add_del(rcns,x-1,len); // x-1: one base before as in VCF + } + x += len; + } + else if ( op==BAM_CHARD_CLIP ) continue; + else error("rcns_set_reads todo: unknown cigar operator %d\n",op); + if ( local_band_max < local_band ) local_band_max = local_band; + } + + // Track the biggest deviation +/- from diagonal, used in BAQ alignment step. 
+ if ( rcns->band < local_band_max ) rcns->band = local_band_max; + } + + return 0; +} + +#if DEBUG_RCNS +static void debug_print_base_freqs(read_cns_t *rcns, const char *ref) +{ + int i,j,k,n = rcns->end - rcns->beg + 1; + fprintf(stderr,"beg,end,pos=%d %d %d\n",(int)rcns->beg,(int)rcns->end,(int)rcns->pos); + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + del_freq_t *dfreq = rcns->del_freq; + for (i=0; ibeg+i+1,ref[i]); + for (j=0; j<6; j++) + fprintf(stderr,"\t%d%s",bfreq[i].base[j],ref[i]=="ACGTNi"[j]?"*":""); + fprintf(stderr,"\t"); + for (j=0; jncvar; i++) + { + candidate_var_t *var = &rcns->cvar[i]; + fprintf(stderr,"\tvar%d pos=%"PRIhts_pos" idx=%d vtype=%s which=%d depth=%d af=%f af_dev=%f\n", + i,var->pos+1,var->idx,vtype2string(var->vtype),var->which,var->depth,var->af,var->af_dev); + } +} +static void debug_print_haplotype_frequency_spectrum(read_cns_t *rcns) +{ + int i,j; + fprintf(stderr,"Haplotype frequencies (bits from left correspond to var0,1,..):\n"); + for (i=0; ihap_freq[i] ) continue; + fprintf(stderr,"\t%d: ",i); + for (j=0; jncvar; j++) + fprintf(stderr,"%d", i&(1<hap_freq[i]); + } +} +static void debug_print_consensus(read_cns_t *rcns, const char *ref) +{ + int i,j,n = rcns->end - rcns->beg + 1; + fprintf(stderr,"ref: "); + for (i=0; icns[i].nseq ) break; + fprintf(stderr,"Consensus%d: ",i); + for (j=0; j<=rcns->cns[i].ipos; j++) + fprintf(stderr,"%c","ACGTN"[(int)rcns->cns[i].seq[j]]); + fprintf(stderr,"#"); + for (; jcns[i].nseq; j++) + fprintf(stderr,"%c","ACGTN"[(int)rcns->cns[i].seq[j]]); + fprintf(stderr,"\n"); + } +} +#else +#define debug_print_base_freqs(rcns,ref) +#define debug_print_candidate_variants(rcns) +#define debug_print_haplotype_frequency_spectrum(rcns) +#define debug_print_consensus(rcns,ref) +#endif + +static int cvar_pos_cmp(const void *aptr, const void *bptr) +{ + candidate_var_t *a = (candidate_var_t*)aptr; + candidate_var_t *b = (candidate_var_t*)bptr; + if ( a->pos < b->pos ) return 
-1; + if ( a->pos > b->pos ) return 1; + if ( a->vtype < b->vtype ) return -1; + if ( a->vtype > b->vtype ) return 1; + if ( a->which < b->which ) return -1; + if ( a->which > b->which ) return 1; + return 0; +} +static void register_variant(read_cns_t *rcns, enum variant_type vtype, int cns_pos, int which, int depth, float freq) +{ + cvar_heap_t *cv_heap = rcns->cv_heap; + if ( vtype==done ) + { + rcns->ncvar = 0; + while (cv_heap->ndat) + { + rcns->cvar[rcns->ncvar++] = cv_heap->dat[0]; + khp_delete(cvh,cv_heap); + } + // sort the variants by pos,type,which to make determination of haplotypes from reads faster + if ( rcns->ncvar ) + qsort(rcns->cvar, rcns->ncvar, sizeof(*rcns->cvar), cvar_pos_cmp); + return; + } + + candidate_var_t var; + var.pos = cns_pos + rcns->beg; + var.which = which; + var.vtype = vtype; + var.depth = depth; + var.af_dev = fabs(0.5-freq); + var.af = freq; + + int free_slot; + + // keep the number of variants small, maximum MAX_NCVAR + if ( rcns->ncvar==MAX_NCVAR ) + { + if ( cvar_not_preferred(&var,&cv_heap->dat[0]) ) return; // no need to add, the new variant is worse than the heap's worst one + free_slot = cv_heap->dat[0].idx; + khp_delete(cvh,cv_heap); + } + else + free_slot = rcns->ncvar++; + var.idx = free_slot; + rcns->cvar[free_slot] = var; + khp_insert(cvh,cv_heap,&var); +} + +// Identify candidate variant positions. (Note that homozygous variants are not considered +// as those will be added trivially by taking the consensus base.) The detection limit is +// for now hard-wired. This has only indirect effect on sensitivity, will just not contribute +// to the consensus template when realigning. 
+static int select_candidate_variants(read_cns_t *rcns, const char *ref) +{ + const float af_th = 0.1; + int i,j, n = rcns->end - rcns->beg + 1; + int max_ins_len = 0; // maximum total length of all insertions applied to allocate big enough buffers + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + del_freq_t *dfreq = rcns->del_freq; + for (i=0; ipos - rcns->beg ) continue; // creating consensus from everything but the variants at the current position + + int dp = 0; + for (j=0; j<4; j++) dp += bfreq[i].base[j]; + for (j=0; jaf_th && af<(1-af_th) ) register_variant(rcns,snv,i,j,dp,af); + } + for (j=0; jaf_th && af<(1-af_th) ) register_variant(rcns,del,i,j,dp,af); + } + for (j=0; jaf_th && af<(1-af_th) ) register_variant(rcns,ins,i,j,dp,af); + } + } + register_variant(rcns,done,0,0,0,0); // finalize + + // Reallocate buffers + if ( rcns->mcns < n + max_ins_len ) + { + n += max_ins_len; + for (i=0; i<2; i++) + { + char *seq = (char*) realloc(rcns->cns[i].seq,sizeof(char)*n); + if ( !seq ) return -1; + rcns->cns[i].seq = seq; + } + rcns->mcns = n; + } + + // Find the longest deletion at the query position + i = rcns->pos - rcns->beg; + rcns->max_del = 0; + for (j=0; jmax_del < dfreq[i].len[j] ) rcns->max_del = dfreq[i].len[j]; + } + + return 0; +} +static int create_haplotype_frequency_spectrum(read_cns_t *rcns) +{ + memset(rcns->hap_freq,0,sizeof(rcns->hap_freq)); + + int i; + for (i=0; inplp; i++) // for each read... 
+ { + const bam_pileup1_t *p = rcns->plp + i; + cigar_state_t cigar; + cstate_init(&cigar,p->b); + + int j,k,hap = 0; + for (j=0; jncvar; j++) + { + candidate_var_t *cvar = &rcns->cvar[j]; + if ( cvar->vtype==snv ) + { + int iseq = cstate_seek_op_fwd(&cigar, cvar->pos, BAM_CMATCH, NULL); + if ( iseq==-2 ) break; + if ( iseq==-1 ) continue; + int nt16 = bam_seqi(cigar.seq, iseq); + if ( seq_nt16_int[nt16]==cvar->which ) hap |= 1<vtype==ins ) + { + int len; + ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg]; + int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len); + if ( iseq==-2 ) break; + if ( iseq==-1 ) continue; + if ( len!=ifrq->len[cvar->which] ) continue; + for (k=0; klen[cvar->which]; k++) + if ( bam_seqi(cigar.seq,iseq+k)!=ifrq->nt16_seq[cvar->which][k] ) break; + if ( k==ifrq->len[cvar->which] ) hap |= 1<vtype==del ) + { + int len; + del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg]; + int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len); + if ( ret==-2 ) break; + if ( ret==-1 ) continue; + if ( len!=dfrq->len[cvar->which] ) continue; + hap |= 1<hap_freq[hap]++; + } + return 0; +} + +typedef struct +{ + int haplotype, count; +} +ii_t; + +static int ii_cmp(const void *a, const void *b) +{ + if ( ((ii_t*)a)->count > ((ii_t*)b)->count ) return -1; + if ( ((ii_t*)a)->count < ((ii_t*)b)->count ) return 1; + return 0; +} + +// Select two most common haplotypes trying to account for 1bp errors. Haplotypes +// are represented as 8-bit numbers, each bit corresponds to one candidate variant. 
+static int correct_haplotype_errors(read_cns_t *rcns) +{ + int i,j, tot = 0; + ii_t freq[NHAP]; + for (i=0; ihap_freq[i]; + tot += rcns->hap_freq[i]; + } + qsort(freq, NHAP, sizeof(ii_t), ii_cmp); // sort haplotypes in descending order + for (i=NHAP-1; i>1; i--) + { + if ( !freq[i].count ) continue; + if ( freq[1].count > tot - freq[0].count - freq[1].count ) break; // the top2 hapotypes cannot change anymore + + // Find a similar haplotype with the highest frequency. Assuming errors go in 0->1 + // direction only and considering one error only. + int count = freq[i].count, max_hap = 0; + for (j=0; j=0 && haphap_freq[hap] ) count = rcns->hap_freq[hap], max_hap = hap; + } + if ( count == freq[i].count ) continue; + + // Update frequency and sort the two modified elements + count = freq[i].count; + freq[i].count = 0; + rcns->hap_freq[freq[i].haplotype] = 0; + rcns->hap_freq[max_hap] += count; + for (j=i+1; j=0; j--) + { + if ( freq[j].haplotype==max_hap ) freq[j].count += count; // update the best matching haplotype + if ( freq[j].count < freq[j+1].count ) + { + ii_t tmp = freq[j]; freq[j] = freq[j+1]; freq[j+1] = tmp; + } + } + } + + // Use only one consensus if the next best haplotype is populated by less than 10% of reads + rcns->ncns = ((float)freq[1].count / (freq[0].count + freq[1].count) < 0.1) ? 1 : 2; + + // Remove unused candidate variants from the top two haplotypes + int hap0 = freq[0].haplotype; + int hap1 = rcns->ncns==2 ? freq[1].haplotype : 0; + rcns->cns_hap[0] = 0; + rcns->cns_hap[1] = 0; + for (i=0,j=0; icvar[j] = rcns->cvar[i]; + if ( hap0 & (1U<cns_hap[0] |= 1U<cns_hap[1] |= 1U<ncvar = j; + +#if DEBUG_RCNS + // This only matters for debugging print + memset(rcns->hap_freq,0,NHAP*sizeof(*rcns->hap_freq)); + rcns->hap_freq[rcns->cns_hap[1]] = freq[1].count; // NB: the order matters when ncns==1 + rcns->hap_freq[rcns->cns_hap[0]] = freq[0].count; +#endif + + return 0; +} + + +// Check how frequent are insertions adjacent to the j-th position. 
Note that reads with an +// insertion usually increment also bfreq counts at this position, but not necessarily so, +// therefore the counts are approximate +static inline void apply_consensus_insertion(read_cns_t *rcns, cns_seq_t *cns, int j, int ivar) +{ + // Only apply consensus insertions that are not being tested by bam2bcf_iaux, i.e. not at the current pos + hts_pos_t ref_pos = rcns->beg + j; + if ( rcns->pos == ref_pos ) return; + + // Only apply when there is no insertion at this position registered as a variant + while ( ivar < rcns->ncvar && rcns->cvar[ivar].pos == ref_pos ) + { + if ( rcns->cvar[ivar].vtype == ins ) return; + ivar++; + } + + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + int k, nreads = 0; + for (k=0; k max_freq*2 ) return; + + int len = ifreq[j].len[kmax]; + char *seq = ifreq[j].nt16_seq[kmax]; + for (k=0; kseq[cns->nseq++] = seq_nt16_int[(int)seq[k]]; +} + +// For each position of the realignment window apply either the candidate variants +// from ith haplotype or decide on the base/ins/del by majority vote +static void create_consensus(read_cns_t *rcns, const char *ref, int ith) +{ + int n = rcns->end - rcns->beg + 1; + cns_seq_t *cns = &rcns->cns[ith]; + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + del_freq_t *dfreq = rcns->del_freq; + hts_pos_t prev_pos = 0; + int j,k, ivar = 0; + for (j=0; jbeg + j; + if ( rcns->pos == ref_pos ) cns->ipos = cns->nseq; + + while ( ivar < rcns->ncvar && rcns->cvar[ivar].pos < ref_pos ) ivar++; + + if ( ivar >= rcns->ncvar || rcns->cvar[ivar].pos != ref_pos ) + { + // This position is not recognised as a het variant so take the most frequent base, including + // a deletion if that is most frequent. 
However, for deleted bases make sure they are not part + // of the deletion that is being tested at this positions + int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]]; + int nk = ( ref_pos < rcns->pos || ref_pos > rcns->pos + rcns->max_del ) ? BF_DEL+1 : BF_DEL; + for (k=0; kseq[cns->nseq++] = kmax; + } + // Only apply consensus insertions that are not being tested by bam2bcf_iaux, i.e. not at the current pos + apply_consensus_insertion(rcns, cns, j, ivar); + continue; + } + int which = rcns->cvar[ivar].which; + if ( !(rcns->cns_hap[ith] & (1U<cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue; + if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k; + } + if ( kmax!=BF_DEL && (!cns->nseq || prev_pos != ref_pos) ) + { + prev_pos = ref_pos; + cns->seq[cns->nseq++] = kmax; + } + apply_consensus_insertion(rcns, cns, j, ivar); + continue; + } + if ( rcns->cvar[ivar].vtype == snv ) + { + prev_pos = ref_pos; + cns->seq[cns->nseq++] = which; + apply_consensus_insertion(rcns, cns, j, ivar); + continue; + } + + // There can be multiple variants at this position, for example snv+ins. SNVs come first + // thanks to cvar_pos_cmp(), make sure the base has not been added already. + if ( !cns->nseq || prev_pos != ref_pos ) + { + int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]]; + for (k=0; k<6; k++) + { + if ( rcns->cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue; + if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k; + } + if ( kmax!=BF_DEL ) + { + prev_pos = ref_pos; + cns->seq[cns->nseq++] = kmax; + } + } + if ( rcns->cvar[ivar].vtype == ins ) + { + int len = ifreq[j].len[which]; + char *seq = ifreq[j].nt16_seq[which]; + for (k=0; kseq[cns->nseq++] = seq_nt16_int[(int)seq[k]]; + } + } + else if ( rcns->cvar[ivar].vtype == del ) j += dfreq[j].len[which]; + } +} + +// The algorithm: +// 1. Identify heterozygous variant positions +// 2. 
Sort variants by abs(variant_allele_freq-0.5) in descending order +// 3. Take the top sorted variants (up to 8 to fit in uint8_t) and count the number of +// corresponding reads to create frequency spectrum +// 4. Correct errors, collapse to the requested number of haplotypes (consensus sequences) +// using majority vote for the distribution tail +cns_seq_t *rcns_get_consensus(read_cns_t *rcns, const char *ref) +{ + debug_print_base_freqs(rcns, ref); + + select_candidate_variants(rcns, ref); + debug_print_candidate_variants(rcns); + + if ( rcns->ncvar ) + { + create_haplotype_frequency_spectrum(rcns); + debug_print_haplotype_frequency_spectrum(rcns); + + correct_haplotype_errors(rcns); + debug_print_candidate_variants(rcns); + debug_print_haplotype_frequency_spectrum(rcns); + } + else + { + rcns->cns_hap[0] = 0; + rcns->ncns = 1; + } + + // create consensus + int i; + for (i=0; incns; i++) create_consensus(rcns,ref,i); + debug_print_consensus(rcns,ref); + + return rcns->cns; +} diff --git a/bcftools/read_consensus.c.pysam.c b/bcftools/read_consensus.c.pysam.c new file mode 100644 index 0000000..a2612fd --- /dev/null +++ b/bcftools/read_consensus.c.pysam.c @@ -0,0 +1,806 @@ +#include "bcftools.pysam.h" + +/* read_consensus.c -- create and maintain consensus of reads + + Copyright (C) 2022 Genome Research Ltd. + + Author: pd3@sanger + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include "bcftools.h" +#include "read_consensus.h" +#include "cigar_state.h" +#include "kheap.h" + + +// Frequency arrays for each variant type +#define NI 10 // number of alternative insertion sequences at one position in a single sample +typedef struct +{ + char *nt16_seq[NI]; + int len[NI]; + int freq[NI]; +} +ins_freq_t; + +typedef struct +{ + int len[NI]; + int freq[NI]; +} +del_freq_t; + +#define BF_DEL 5 +typedef struct +{ + int base[6]; // frequencies of A,C,G,T,N,deletion +} +base_freq_t; + + +// Candidate variants for each interesting position to build consensus haplotypes +enum variant_type { snv, ins, del, done }; +typedef struct +{ + enum variant_type vtype; + hts_pos_t pos; // variant position (reference sequence coordinates), indels follow VCF convention + int idx; // temporary 0-based index to rcns.cvar + int which, // base/ins/del in rcns.[base|ins|del]_freq array + depth; // coverage at the position + float af, af_dev; // variant allele frequency (just for debugging printout) and absolute af deviation from 0.5 +} +candidate_var_t; +static inline int cvar_not_preferred(candidate_var_t *a, candidate_var_t *b) +{ + if ( a->af_dev == b->af_dev ) return a->depth < b->depth ? 1 : 0; + return a->af_dev > b->af_dev ? 
1 : 0; +} +KHEAP_INIT(cvh, candidate_var_t, cvar_not_preferred); +typedef khp_cvh_t cvar_heap_t; + +#define MAX_NCVAR 8 // This results in alloc() of 2^MAX_NCVAR possible haplotypes +#define NHAP (1<mfreq; i++) + { + ins_freq_t *ifrq = &rcns->ins_freq[i]; + for (j=0; jnt16_seq[j]; j++) free(ifrq->nt16_seq[j]); + } + for (i=0; i<2; i++) + free(rcns->cns[i].seq); + free(rcns->ins_freq); + free(rcns->del_freq); + free(rcns->base_freq); + free(rcns->stmp); + khp_destroy(cvh,rcns->cv_heap); + free(rcns); +} +static int init_arrays(read_cns_t *rcns) +{ + int i,j,n = rcns->end - rcns->beg + 1; + if ( n > rcns->mfreq ) + { + ins_freq_t *ifrq = (ins_freq_t*) realloc(rcns->ins_freq,sizeof(*rcns->ins_freq)*n); + if ( !ifrq ) return -1; + rcns->ins_freq = ifrq; + memset(ifrq+rcns->mfreq,0,sizeof(*rcns->ins_freq)*(n-rcns->mfreq)); + + del_freq_t *dfrq = (del_freq_t*) realloc(rcns->del_freq,sizeof(*rcns->del_freq)*n); + if ( !dfrq ) return -1; + rcns->del_freq = dfrq; + memset(dfrq+rcns->mfreq,0,sizeof(*rcns->del_freq)*(n-rcns->mfreq)); + + base_freq_t *bfrq = (base_freq_t*) realloc(rcns->base_freq,sizeof(*rcns->base_freq)*n); + if ( !bfrq ) return -1; + rcns->base_freq = bfrq; + memset(bfrq+rcns->mfreq,0,sizeof(*rcns->base_freq)*(n-rcns->mfreq)); + + rcns->mfreq = n; + } + memset(rcns->base_freq,0,sizeof(*rcns->base_freq)*n); + memset(rcns->del_freq,0,sizeof(*rcns->del_freq)*n); + for (i=0; iins_freq[i]; + for (j=0; jnt16_seq[j]; j++) free(ifrq->nt16_seq[j]); + } + memset(rcns->ins_freq,0,sizeof(*rcns->ins_freq)*n); + return 0; +} +int rcns_reset(read_cns_t *rcns, hts_pos_t pos, hts_pos_t beg, hts_pos_t end) +{ + rcns->band = 0; + rcns->pos = pos; + rcns->beg = beg; + rcns->end = end; + int i; + for (i=0; i<2; i++) rcns->cns[i].nseq = rcns->cns[i].ipos = 0; + // this should not be necessary if the caller did run all steps + while (rcns->cv_heap->ndat) khp_delete(cvh, rcns->cv_heap); + return init_arrays(rcns); +} + +static inline void add_base(read_cns_t *rcns, int ref_pos, int 
nt16) +{ + int i = ref_pos - rcns->beg; + rcns->base_freq[i].base[seq_nt16_int[nt16]]++; +} +static void add_ins(read_cns_t *rcns, int ref_pos, int seq_pos, uint8_t *raw_seq, int len) +{ + int i = ref_pos - rcns->beg; + ins_freq_t *ifrq = &rcns->ins_freq[i]; + char *str; + if ( rcns->mstmp < len ) + { + str = realloc(rcns->stmp,len*sizeof(*str)); + if ( !str ) return; + rcns->mstmp = len; + rcns->stmp = str; + } + else + str = rcns->stmp; + for (i=0; int16_seq[i]; i++) + if ( ifrq->len[i]==len && !memcmp(ifrq->nt16_seq[i],str,len) ) break; + + if ( i>=NI ) return; // too many choices, typically homopolymers in long reads; discard + + if ( !ifrq->nt16_seq[i] ) // new insertion + { + if ( !(ifrq->nt16_seq[i]=malloc(len)) ) return; + memcpy(ifrq->nt16_seq[i], str, len); + ifrq->len[i] = len; + } + ifrq->freq[i]++; +} +static void add_del(read_cns_t *rcns, int ref_pos, int len) +{ + int i = ref_pos - rcns->beg; + int j,n = rcns->end - rcns->beg + 1; + if ( i + len + 1 < n ) n = i + len + 1; + for (j=i+1; jbase_freq[j].base[BF_DEL]++; + + del_freq_t *dfrq = &rcns->del_freq[i]; + for (i=0; ilen[i]; i++) + if ( dfrq->len[i]==len ) break; + + if ( i>=NI ) return; // too many choices, typically homopolymers in long reads; discard + + if ( !dfrq->len[i] ) dfrq->len[i] = len; // new deletion + dfrq->freq[i]++; +} + +read_cns_t *rcns_init(hts_pos_t pos, hts_pos_t beg, hts_pos_t end) +{ + read_cns_t *rcns = (read_cns_t*) calloc(1,sizeof(read_cns_t)); + rcns->pos = pos; + rcns->beg = beg; + rcns->end = end; + rcns->cv_heap = khp_init(cvh); + if ( init_arrays(rcns)!=0 ) + { + rcns_destroy(rcns); + return NULL; + } + return rcns; +} + +int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp) +{ + // save the reads for phasing, this can be called multiple times + rcns->plp = plp; + rcns->nplp = nplp; + + // fill consensus arrays + int i,j,k, local_band_max = 0; // maximum absolute deviation from diagonal + for (i=0; ib; + int x = b->core.pos; // ref coordinate + int y = 
0; // seq coordinate + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + int local_band = 0; // current deviation from diagonal + for (k = 0; k < b->core.n_cigar; ++k) + { + int op = cigar[k] & BAM_CIGAR_MASK; + int len = cigar[k] >> BAM_CIGAR_SHIFT; + if ( op==BAM_CSOFT_CLIP ) y += len; + else if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF ) + { + if ( xend && x+len>rcns->beg ) + { + int j_beg = rcns->beg > x ? rcns->beg - x : 0; // how many bases to skip in the ref and qry + int j_end = rcns->end < x + len - 1 ? rcns->end - x : len - 1; + x += j_beg; + y += j_beg; + for (j=j_beg; j<=j_end; j++, x++, y++) add_base(rcns,x,bam_seqi(seq,y)); + } + else + { + x += len; + y += len; + } + } + else if ( op==BAM_CINS ) + { + if ( x>rcns->beg && xend ) + { + local_band += p->indel; + add_ins(rcns,x-1,y,seq,len); // x-1: one base before as in VCF + } + y += len; + } + else if ( op==BAM_CDEL ) + { + if ( x>rcns->beg && x+len-1<=rcns->end ) + { + local_band += -p->indel; + add_del(rcns,x-1,len); // x-1: one base before as in VCF + } + x += len; + } + else if ( op==BAM_CHARD_CLIP ) continue; + else error("rcns_set_reads todo: unknown cigar operator %d\n",op); + if ( local_band_max < local_band ) local_band_max = local_band; + } + + // Track the biggest deviation +/- from diagonal, used in BAQ alignment step. 
+ if ( rcns->band < local_band_max ) rcns->band = local_band_max; + } + + return 0; +} + +#if DEBUG_RCNS +static void debug_print_base_freqs(read_cns_t *rcns, const char *ref) +{ + int i,j,k,n = rcns->end - rcns->beg + 1; + fprintf(bcftools_stderr,"beg,end,pos=%d %d %d\n",(int)rcns->beg,(int)rcns->end,(int)rcns->pos); + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + del_freq_t *dfreq = rcns->del_freq; + for (i=0; ibeg+i+1,ref[i]); + for (j=0; j<6; j++) + fprintf(bcftools_stderr,"\t%d%s",bfreq[i].base[j],ref[i]=="ACGTNi"[j]?"*":""); + fprintf(bcftools_stderr,"\t"); + for (j=0; jncvar; i++) + { + candidate_var_t *var = &rcns->cvar[i]; + fprintf(bcftools_stderr,"\tvar%d pos=%"PRIhts_pos" idx=%d vtype=%s which=%d depth=%d af=%f af_dev=%f\n", + i,var->pos+1,var->idx,vtype2string(var->vtype),var->which,var->depth,var->af,var->af_dev); + } +} +static void debug_print_haplotype_frequency_spectrum(read_cns_t *rcns) +{ + int i,j; + fprintf(bcftools_stderr,"Haplotype frequencies (bits from left correspond to var0,1,..):\n"); + for (i=0; ihap_freq[i] ) continue; + fprintf(bcftools_stderr,"\t%d: ",i); + for (j=0; jncvar; j++) + fprintf(bcftools_stderr,"%d", i&(1<hap_freq[i]); + } +} +static void debug_print_consensus(read_cns_t *rcns, const char *ref) +{ + int i,j,n = rcns->end - rcns->beg + 1; + fprintf(bcftools_stderr,"ref: "); + for (i=0; icns[i].nseq ) break; + fprintf(bcftools_stderr,"Consensus%d: ",i); + for (j=0; j<=rcns->cns[i].ipos; j++) + fprintf(bcftools_stderr,"%c","ACGTN"[(int)rcns->cns[i].seq[j]]); + fprintf(bcftools_stderr,"#"); + for (; jcns[i].nseq; j++) + fprintf(bcftools_stderr,"%c","ACGTN"[(int)rcns->cns[i].seq[j]]); + fprintf(bcftools_stderr,"\n"); + } +} +#else +#define debug_print_base_freqs(rcns,ref) +#define debug_print_candidate_variants(rcns) +#define debug_print_haplotype_frequency_spectrum(rcns) +#define debug_print_consensus(rcns,ref) +#endif + +static int cvar_pos_cmp(const void *aptr, const void *bptr) +{ + 
candidate_var_t *a = (candidate_var_t*)aptr; + candidate_var_t *b = (candidate_var_t*)bptr; + if ( a->pos < b->pos ) return -1; + if ( a->pos > b->pos ) return 1; + if ( a->vtype < b->vtype ) return -1; + if ( a->vtype > b->vtype ) return 1; + if ( a->which < b->which ) return -1; + if ( a->which > b->which ) return 1; + return 0; +} +static void register_variant(read_cns_t *rcns, enum variant_type vtype, int cns_pos, int which, int depth, float freq) +{ + cvar_heap_t *cv_heap = rcns->cv_heap; + if ( vtype==done ) + { + rcns->ncvar = 0; + while (cv_heap->ndat) + { + rcns->cvar[rcns->ncvar++] = cv_heap->dat[0]; + khp_delete(cvh,cv_heap); + } + // sort the variants by pos,type,which to make determination of haplotypes from reads faster + if ( rcns->ncvar ) + qsort(rcns->cvar, rcns->ncvar, sizeof(*rcns->cvar), cvar_pos_cmp); + return; + } + + candidate_var_t var; + var.pos = cns_pos + rcns->beg; + var.which = which; + var.vtype = vtype; + var.depth = depth; + var.af_dev = fabs(0.5-freq); + var.af = freq; + + int free_slot; + + // keep the number of variants small, maximum MAX_NCVAR + if ( rcns->ncvar==MAX_NCVAR ) + { + if ( cvar_not_preferred(&var,&cv_heap->dat[0]) ) return; // no need to add, the new variant is worse than the heap's worst one + free_slot = cv_heap->dat[0].idx; + khp_delete(cvh,cv_heap); + } + else + free_slot = rcns->ncvar++; + var.idx = free_slot; + rcns->cvar[free_slot] = var; + khp_insert(cvh,cv_heap,&var); +} + +// Identify candidate variant positions. (Note that homozygous variants are not considered +// as those will be added trivially by taking the consensus base.) The detection limit is +// for now hard-wired. This has only indirect effect on sensitivity, will just not contribute +// to the consensus template when realigning. 
+static int select_candidate_variants(read_cns_t *rcns, const char *ref) +{ + const float af_th = 0.1; + int i,j, n = rcns->end - rcns->beg + 1; + int max_ins_len = 0; // maximum total length of all insertions applied to allocate big enough buffers + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + del_freq_t *dfreq = rcns->del_freq; + for (i=0; ipos - rcns->beg ) continue; // creating consensus from everything but the variants at the current position + + int dp = 0; + for (j=0; j<4; j++) dp += bfreq[i].base[j]; + for (j=0; jaf_th && af<(1-af_th) ) register_variant(rcns,snv,i,j,dp,af); + } + for (j=0; jaf_th && af<(1-af_th) ) register_variant(rcns,del,i,j,dp,af); + } + for (j=0; jaf_th && af<(1-af_th) ) register_variant(rcns,ins,i,j,dp,af); + } + } + register_variant(rcns,done,0,0,0,0); // finalize + + // Reallocate buffers + if ( rcns->mcns < n + max_ins_len ) + { + n += max_ins_len; + for (i=0; i<2; i++) + { + char *seq = (char*) realloc(rcns->cns[i].seq,sizeof(char)*n); + if ( !seq ) return -1; + rcns->cns[i].seq = seq; + } + rcns->mcns = n; + } + + // Find the longest deletion at the query position + i = rcns->pos - rcns->beg; + rcns->max_del = 0; + for (j=0; jmax_del < dfreq[i].len[j] ) rcns->max_del = dfreq[i].len[j]; + } + + return 0; +} +static int create_haplotype_frequency_spectrum(read_cns_t *rcns) +{ + memset(rcns->hap_freq,0,sizeof(rcns->hap_freq)); + + int i; + for (i=0; inplp; i++) // for each read... 
+ { + const bam_pileup1_t *p = rcns->plp + i; + cigar_state_t cigar; + cstate_init(&cigar,p->b); + + int j,k,hap = 0; + for (j=0; jncvar; j++) + { + candidate_var_t *cvar = &rcns->cvar[j]; + if ( cvar->vtype==snv ) + { + int iseq = cstate_seek_op_fwd(&cigar, cvar->pos, BAM_CMATCH, NULL); + if ( iseq==-2 ) break; + if ( iseq==-1 ) continue; + int nt16 = bam_seqi(cigar.seq, iseq); + if ( seq_nt16_int[nt16]==cvar->which ) hap |= 1<vtype==ins ) + { + int len; + ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg]; + int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len); + if ( iseq==-2 ) break; + if ( iseq==-1 ) continue; + if ( len!=ifrq->len[cvar->which] ) continue; + for (k=0; klen[cvar->which]; k++) + if ( bam_seqi(cigar.seq,iseq+k)!=ifrq->nt16_seq[cvar->which][k] ) break; + if ( k==ifrq->len[cvar->which] ) hap |= 1<vtype==del ) + { + int len; + del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg]; + int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len); + if ( ret==-2 ) break; + if ( ret==-1 ) continue; + if ( len!=dfrq->len[cvar->which] ) continue; + hap |= 1<hap_freq[hap]++; + } + return 0; +} + +typedef struct +{ + int haplotype, count; +} +ii_t; + +static int ii_cmp(const void *a, const void *b) +{ + if ( ((ii_t*)a)->count > ((ii_t*)b)->count ) return -1; + if ( ((ii_t*)a)->count < ((ii_t*)b)->count ) return 1; + return 0; +} + +// Select two most common haplotypes trying to account for 1bp errors. Haplotypes +// are represented as 8-bit numbers, each bit corresponds to one candidate variant. 
+static int correct_haplotype_errors(read_cns_t *rcns) +{ + int i,j, tot = 0; + ii_t freq[NHAP]; + for (i=0; ihap_freq[i]; + tot += rcns->hap_freq[i]; + } + qsort(freq, NHAP, sizeof(ii_t), ii_cmp); // sort haplotypes in descending order + for (i=NHAP-1; i>1; i--) + { + if ( !freq[i].count ) continue; + if ( freq[1].count > tot - freq[0].count - freq[1].count ) break; // the top2 hapotypes cannot change anymore + + // Find a similar haplotype with the highest frequency. Assuming errors go in 0->1 + // direction only and considering one error only. + int count = freq[i].count, max_hap = 0; + for (j=0; j=0 && haphap_freq[hap] ) count = rcns->hap_freq[hap], max_hap = hap; + } + if ( count == freq[i].count ) continue; + + // Update frequency and sort the two modified elements + count = freq[i].count; + freq[i].count = 0; + rcns->hap_freq[freq[i].haplotype] = 0; + rcns->hap_freq[max_hap] += count; + for (j=i+1; j=0; j--) + { + if ( freq[j].haplotype==max_hap ) freq[j].count += count; // update the best matching haplotype + if ( freq[j].count < freq[j+1].count ) + { + ii_t tmp = freq[j]; freq[j] = freq[j+1]; freq[j+1] = tmp; + } + } + } + + // Use only one consensus if the next best haplotype is populated by less than 10% of reads + rcns->ncns = ((float)freq[1].count / (freq[0].count + freq[1].count) < 0.1) ? 1 : 2; + + // Remove unused candidate variants from the top two haplotypes + int hap0 = freq[0].haplotype; + int hap1 = rcns->ncns==2 ? freq[1].haplotype : 0; + rcns->cns_hap[0] = 0; + rcns->cns_hap[1] = 0; + for (i=0,j=0; icvar[j] = rcns->cvar[i]; + if ( hap0 & (1U<cns_hap[0] |= 1U<cns_hap[1] |= 1U<ncvar = j; + +#if DEBUG_RCNS + // This only matters for debugging print + memset(rcns->hap_freq,0,NHAP*sizeof(*rcns->hap_freq)); + rcns->hap_freq[rcns->cns_hap[1]] = freq[1].count; // NB: the order matters when ncns==1 + rcns->hap_freq[rcns->cns_hap[0]] = freq[0].count; +#endif + + return 0; +} + + +// Check how frequent are insertions adjacent to the j-th position. 
Note that reads with an +// insertion usually increment also bfreq counts at this position, but not necessarily so, +// therefore the counts are approximate +static inline void apply_consensus_insertion(read_cns_t *rcns, cns_seq_t *cns, int j, int ivar) +{ + // Only apply consensus insertions that are not being tested by bam2bcf_iaux, i.e. not at the current pos + hts_pos_t ref_pos = rcns->beg + j; + if ( rcns->pos == ref_pos ) return; + + // Only apply when there is no insertion at this position registered as a variant + while ( ivar < rcns->ncvar && rcns->cvar[ivar].pos == ref_pos ) + { + if ( rcns->cvar[ivar].vtype == ins ) return; + ivar++; + } + + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + int k, nreads = 0; + for (k=0; k max_freq*2 ) return; + + int len = ifreq[j].len[kmax]; + char *seq = ifreq[j].nt16_seq[kmax]; + for (k=0; kseq[cns->nseq++] = seq_nt16_int[(int)seq[k]]; +} + +// For each position of the realignment window apply either the candidate variants +// from ith haplotype or decide on the base/ins/del by majority vote +static void create_consensus(read_cns_t *rcns, const char *ref, int ith) +{ + int n = rcns->end - rcns->beg + 1; + cns_seq_t *cns = &rcns->cns[ith]; + base_freq_t *bfreq = rcns->base_freq; + ins_freq_t *ifreq = rcns->ins_freq; + del_freq_t *dfreq = rcns->del_freq; + hts_pos_t prev_pos = 0; + int j,k, ivar = 0; + for (j=0; jbeg + j; + if ( rcns->pos == ref_pos ) cns->ipos = cns->nseq; + + while ( ivar < rcns->ncvar && rcns->cvar[ivar].pos < ref_pos ) ivar++; + + if ( ivar >= rcns->ncvar || rcns->cvar[ivar].pos != ref_pos ) + { + // This position is not recognised as a het variant so take the most frequent base, including + // a deletion if that is most frequent. 
However, for deleted bases make sure they are not part + // of the deletion that is being tested at this positions + int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]]; + int nk = ( ref_pos < rcns->pos || ref_pos > rcns->pos + rcns->max_del ) ? BF_DEL+1 : BF_DEL; + for (k=0; kseq[cns->nseq++] = kmax; + } + // Only apply consensus insertions that are not being tested by bam2bcf_iaux, i.e. not at the current pos + apply_consensus_insertion(rcns, cns, j, ivar); + continue; + } + int which = rcns->cvar[ivar].which; + if ( !(rcns->cns_hap[ith] & (1U<cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue; + if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k; + } + if ( kmax!=BF_DEL && (!cns->nseq || prev_pos != ref_pos) ) + { + prev_pos = ref_pos; + cns->seq[cns->nseq++] = kmax; + } + apply_consensus_insertion(rcns, cns, j, ivar); + continue; + } + if ( rcns->cvar[ivar].vtype == snv ) + { + prev_pos = ref_pos; + cns->seq[cns->nseq++] = which; + apply_consensus_insertion(rcns, cns, j, ivar); + continue; + } + + // There can be multiple variants at this position, for example snv+ins. SNVs come first + // thanks to cvar_pos_cmp(), make sure the base has not been added already. + if ( !cns->nseq || prev_pos != ref_pos ) + { + int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]]; + for (k=0; k<6; k++) + { + if ( rcns->cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue; + if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k; + } + if ( kmax!=BF_DEL ) + { + prev_pos = ref_pos; + cns->seq[cns->nseq++] = kmax; + } + } + if ( rcns->cvar[ivar].vtype == ins ) + { + int len = ifreq[j].len[which]; + char *seq = ifreq[j].nt16_seq[which]; + for (k=0; kseq[cns->nseq++] = seq_nt16_int[(int)seq[k]]; + } + } + else if ( rcns->cvar[ivar].vtype == del ) j += dfreq[j].len[which]; + } +} + +// The algorithm: +// 1. Identify heterozygous variant positions +// 2. 
Sort variants by abs(variant_allele_freq-0.5) in descending order +// 3. Take the top sorted variants (up to 8 to fit in uint8_t) and count the number of +// corresponding reads to create frequency spectrum +// 4. Correct errors, collapse to the requested number of haplotypes (consensus sequences) +// using majority vote for the distribution tail +cns_seq_t *rcns_get_consensus(read_cns_t *rcns, const char *ref) +{ + debug_print_base_freqs(rcns, ref); + + select_candidate_variants(rcns, ref); + debug_print_candidate_variants(rcns); + + if ( rcns->ncvar ) + { + create_haplotype_frequency_spectrum(rcns); + debug_print_haplotype_frequency_spectrum(rcns); + + correct_haplotype_errors(rcns); + debug_print_candidate_variants(rcns); + debug_print_haplotype_frequency_spectrum(rcns); + } + else + { + rcns->cns_hap[0] = 0; + rcns->ncns = 1; + } + + // create consensus + int i; + for (i=0; incns; i++) create_consensus(rcns,ref,i); + debug_print_consensus(rcns,ref); + + return rcns->cns; +} diff --git a/bcftools/read_consensus.h b/bcftools/read_consensus.h new file mode 100644 index 0000000..9f5cb79 --- /dev/null +++ b/bcftools/read_consensus.h @@ -0,0 +1,63 @@ +/* read_consensus.h -- create and maintain consensus of reads + + Copyright (C) 2022 Genome Research Ltd. + + Author: pd3@sanger + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. */ + +#ifndef READ_CONSENSUS_H +#define READ_CONSENSUS_H + +#include +#include +#include + +#ifndef DEBUG_RCNS +#define DEBUG_RCNS 0 +#endif + +typedef struct +{ + char *seq; // nt5 sequence: "ACGTN"[(int)seq[i]] + int nseq, ipos; // the sequence length and the `pos` index relative to seq +} +cns_seq_t; + +typedef struct _read_cns_t read_cns_t; + +// Init and destroy read consensus +read_cns_t *rcns_init(hts_pos_t pos, hts_pos_t beg, hts_pos_t end); +void rcns_destroy(read_cns_t *rcns); + +// Reset the structures for new sample and/or position +int rcns_reset(read_cns_t *rcns, hts_pos_t pos, hts_pos_t beg, hts_pos_t end); + +// Add reads to consensus. The provided structures must continue to exist +// until rcns_get_consensus() is called. +// +// Todo (easy): allow it to be called once or multiple times, eg for +// creating a shared consensus for multiple samples +int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp); + +// Generate up to two consensus sequences, cns_seq[1].nseq is 0 when only +// the first is set +cns_seq_t *rcns_get_consensus(read_cns_t *rcns, const char *ref); + +#endif diff --git a/bcftools/smpl_ilist.h b/bcftools/smpl_ilist.h index 79292c3..e273ac4 100644 --- a/bcftools/smpl_ilist.h +++ b/bcftools/smpl_ilist.h @@ -1,4 +1,4 @@ -/* +/* Copyright (C) 2016-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -9,10 +9,10 @@ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -35,7 +35,7 @@ #define SMPL_SINGLE 2 // single sample expected #define SMPL_PAIR1 4 // two samples expected, the first is from the bcf hdr #define SMPL_PAIR2 8 // two samples expected, the second is from the bcf hdr -#define SMPL_VERBOSE 16 // print warnings +#define SMPL_VERBOSE 16 // print warnings #define SMPL_REORDER 32 // reorder samples as asked, sample_list[i] points to the VCF header index typedef struct @@ -46,6 +46,7 @@ typedef struct } smpl_ilist_t; +// Pass NULL for sample_list to get all samples smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags); smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags); void smpl_ilist_destroy(smpl_ilist_t *smpl); diff --git a/bcftools/vcfannotate.c b/bcftools/vcfannotate.c index d33fd90..495d2b5 100644 --- a/bcftools/vcfannotate.c +++ b/bcftools/vcfannotate.c @@ -2022,7 +2022,7 @@ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); for (i=0; insample_map; i++) args->sample_map[i] = -1; - int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file + int flags = !src ? 
SMPL_STRICT|SMPL_SINGLE|SMPL_REORDER : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is tab vs vcf annotation file smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names); args->nsmpl_annot = ilist->n; @@ -3427,6 +3427,7 @@ int main_vcfannotate(int argc, char *argv[]) case 'H': args->header_lines = dbuf_push(args->header_lines,strdup(optarg)); break; case 1 : args->rename_chrs = optarg; break; case 2 : + if ( args->pair_logic==-1 ) args->pair_logic = 0; if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF; else if ( !strcmp(optarg,"indels") ) args->pair_logic |= BCF_SR_PAIR_INDEL_REF; else if ( !strcmp(optarg,"both") ) args->pair_logic |= BCF_SR_PAIR_BOTH_REF; diff --git a/bcftools/vcfannotate.c.pysam.c b/bcftools/vcfannotate.c.pysam.c index e45c305..54f6a39 100644 --- a/bcftools/vcfannotate.c.pysam.c +++ b/bcftools/vcfannotate.c.pysam.c @@ -2024,7 +2024,7 @@ static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst) args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map); for (i=0; insample_map; i++) args->sample_map[i] = -1; - int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file + int flags = !src ? 
SMPL_STRICT|SMPL_SINGLE|SMPL_REORDER : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is tab vs vcf annotation file smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names); args->nsmpl_annot = ilist->n; @@ -3429,6 +3429,7 @@ int main_vcfannotate(int argc, char *argv[]) case 'H': args->header_lines = dbuf_push(args->header_lines,strdup(optarg)); break; case 1 : args->rename_chrs = optarg; break; case 2 : + if ( args->pair_logic==-1 ) args->pair_logic = 0; if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF; else if ( !strcmp(optarg,"indels") ) args->pair_logic |= BCF_SR_PAIR_INDEL_REF; else if ( !strcmp(optarg,"both") ) args->pair_logic |= BCF_SR_PAIR_BOTH_REF; diff --git a/bcftools/vcfcall.c b/bcftools/vcfcall.c index 4c9e88c..1cd6f50 100644 --- a/bcftools/vcfcall.c +++ b/bcftools/vcfcall.c @@ -568,9 +568,9 @@ bcf1_t *next_line(args_t *args) memset(&rec_tgt,0,sizeof(rec_tgt)); regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr); regitr_t *tmp_itr = regitr_init(args->tgt_idx); - regitr_copy(tmp_itr, args->tgt_itr); for (i=0; itgt_itr); rec = vcfbuf_peek(args->vcfbuf, i); int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1; while ( regitr_overlap(tmp_itr) ) diff --git a/bcftools/vcfcall.c.pysam.c b/bcftools/vcfcall.c.pysam.c index c715c53..975247c 100644 --- a/bcftools/vcfcall.c.pysam.c +++ b/bcftools/vcfcall.c.pysam.c @@ -570,9 +570,9 @@ bcf1_t *next_line(args_t *args) memset(&rec_tgt,0,sizeof(rec_tgt)); regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr); regitr_t *tmp_itr = regitr_init(args->tgt_idx); - regitr_copy(tmp_itr, args->tgt_itr); for (i=0; itgt_itr); rec = vcfbuf_peek(args->vcfbuf, i); int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 
1 : -1; while ( regitr_overlap(tmp_itr) ) diff --git a/bcftools/vcfconcat.c b/bcftools/vcfconcat.c index 0246b59..74fd036 100644 --- a/bcftools/vcfconcat.c +++ b/bcftools/vcfconcat.c @@ -1018,8 +1018,8 @@ int main_vcfconcat(int argc, char *argv[]) if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 'v': - args->verbose = strtol(optarg, 0, 0); - error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); + args->verbose = strtol(optarg, &tmp, 0); + if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); break; case 'h': case '?': usage(args); break; diff --git a/bcftools/vcfconcat.c.pysam.c b/bcftools/vcfconcat.c.pysam.c index e2cd43f..e1baeef 100644 --- a/bcftools/vcfconcat.c.pysam.c +++ b/bcftools/vcfconcat.c.pysam.c @@ -1020,8 +1020,8 @@ int main_vcfconcat(int argc, char *argv[]) if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg); break; case 'v': - args->verbose = strtol(optarg, 0, 0); - error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); + args->verbose = strtol(optarg, &tmp, 0); + if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n"); break; case 'h': case '?': usage(args); break; diff --git a/bcftools/vcfmerge.c b/bcftools/vcfmerge.c index 7296a00..621f410 100644 --- a/bcftools/vcfmerge.c +++ b/bcftools/vcfmerge.c @@ -495,7 +495,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf else if ( var_len==BCF_VL_G ) { args->maux->nagr_map = bcf_alleles2gt(line->n_allele-1,line->n_allele-1)+1; - assert( ret==line->n_allele || ret==args->maux->nagr_map ); + if ( ret!=line->n_allele && ret!=args->maux->nagr_map ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); if ( ret==line->n_allele ) // haploid { 
args->maux->nagr_map = line->n_allele; @@ -974,7 +974,7 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) int k = 0; for (i=0; inals; i++) if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]); - assert( k==ma->nout_als ); + if ( k!=ma->nout_als ) error("Error: could not merge alleles at %s:%"PRId64", sanity check failed: %d!=%d\n",bcf_seqname(out_hdr,out),out->pos+1,k,ma->nout_als); normalize_alleles(ma->out_als, ma->nout_als); bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als); free(al_idxs); diff --git a/bcftools/vcfmerge.c.pysam.c b/bcftools/vcfmerge.c.pysam.c index 3a26cae..2231a57 100644 --- a/bcftools/vcfmerge.c.pysam.c +++ b/bcftools/vcfmerge.c.pysam.c @@ -497,7 +497,7 @@ static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, inf else if ( var_len==BCF_VL_G ) { args->maux->nagr_map = bcf_alleles2gt(line->n_allele-1,line->n_allele-1)+1; - assert( ret==line->n_allele || ret==args->maux->nagr_map ); + if ( ret!=line->n_allele && ret!=args->maux->nagr_map ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1); if ( ret==line->n_allele ) // haploid { args->maux->nagr_map = line->n_allele; @@ -976,7 +976,7 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) int k = 0; for (i=0; inals; i++) if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]); - assert( k==ma->nout_als ); + if ( k!=ma->nout_als ) error("Error: could not merge alleles at %s:%"PRId64", sanity check failed: %d!=%d\n",bcf_seqname(out_hdr,out),out->pos+1,k,ma->nout_als); normalize_alleles(ma->out_als, ma->nout_als); bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als); free(al_idxs); diff --git a/bcftools/vcfnorm.c b/bcftools/vcfnorm.c index 38c5de4..9538f8d 100644 --- a/bcftools/vcfnorm.c +++ b/bcftools/vcfnorm.c @@ -1,6 +1,6 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2021 Genome Research Ltd. 
+ Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -102,7 +102,7 @@ typedef struct int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; abuf_opt_t atomize; - int use_star_allele; + int use_star_allele, ma_use_ref_allele; char *old_rec_tag; htsFile *out; } @@ -711,11 +711,14 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int for (j=0; jma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is if ( bcf_gt_allele(gt[j])==ialt+1 ) gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT - else + else if ( args->ma_use_ref_allele ) gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF + else + gt[j] = bcf_gt_missing | bcf_gt_is_phased(gt[j]); // set to missing } gt += ngts; } @@ -723,7 +726,7 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst) { - #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \ + #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end,set_missing) \ { \ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \ int ntmp = args->ntmp_arr1 / sizeof(type_t); \ @@ -762,7 +765,10 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int type_t *src_vals = vals, *dst_vals = vals; \ for (i=0; ihdr,BCF_HL_FMT,fmt->id)) { - case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break; - case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break; + case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[isrc]==bcf_int32_vector_end, src_vals[isrc]==bcf_int32_missing, dst_vals[idst]=bcf_int32_vector_end, 
dst_vals[idst]=bcf_int32_missing); break; + case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[isrc]), bcf_float_is_missing(src_vals[isrc]), bcf_float_set_vector_end(dst_vals[idst]), bcf_float_set_missing(src_vals[idst])); break; } #undef BRANCH_NUMERIC } @@ -2087,6 +2106,7 @@ static void usage(void) fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n"); fprintf(stderr, " --no-version Do not append version and command line to the header\n"); fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); @@ -2126,6 +2146,7 @@ int main_vcfnorm(int argc, char *argv[]) args->buf_win = 1000; args->mrows_collapse = COLLAPSE_BOTH; args->do_indels = 1; + args->ma_use_ref_allele = 1; args->clevel = -1; int region_is_file = 0; int targets_is_file = 0; @@ -2144,6 +2165,7 @@ int main_vcfnorm(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, + {"multi-overlaps",required_argument,NULL,13}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"regions-overlap",required_argument,NULL,1}, @@ -2177,6 +2199,11 @@ int main_vcfnorm(int argc, char *argv[]) else error("Invalid argument to --atom-overlaps. 
Perhaps you wanted: \"--atom-overlaps '*'\"?\n"); break; case 12 : args->old_rec_tag = optarg; break; + case 13 : + if ( optarg[0]=='0' ) args->ma_use_ref_allele = 1; + else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0; + else error("Invalid argument to --multi-overlaps\n"); + break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; diff --git a/bcftools/vcfnorm.c.pysam.c b/bcftools/vcfnorm.c.pysam.c index a292b96..e2d4177 100644 --- a/bcftools/vcfnorm.c.pysam.c +++ b/bcftools/vcfnorm.c.pysam.c @@ -2,7 +2,7 @@ /* vcfnorm.c -- Left-align and normalize indels. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Petr Danecek @@ -104,7 +104,7 @@ typedef struct int record_cmd_line, force, force_warned, keep_sum_ad; abuf_t *abuf; abuf_opt_t atomize; - int use_star_allele; + int use_star_allele, ma_use_ref_allele; char *old_rec_tag; htsFile *out; } @@ -713,11 +713,14 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int for (j=0; jma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is if ( bcf_gt_allele(gt[j])==ialt+1 ) gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT - else + else if ( args->ma_use_ref_allele ) gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF + else + gt[j] = bcf_gt_missing | bcf_gt_is_phased(gt[j]); // set to missing } gt += ngts; } @@ -725,7 +728,7 @@ static void split_format_genotype(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int } static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst) { - #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \ + #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end,set_missing) \ { \ const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \ int ntmp = args->ntmp_arr1 / sizeof(type_t); \ @@ 
-764,7 +767,10 @@ static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int type_t *src_vals = vals, *dst_vals = vals; \ for (i=0; ihdr,BCF_HL_FMT,fmt->id)) { - case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break; - case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break; + case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[isrc]==bcf_int32_vector_end, src_vals[isrc]==bcf_int32_missing, dst_vals[idst]=bcf_int32_vector_end, dst_vals[idst]=bcf_int32_missing); break; + case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[isrc]), bcf_float_is_missing(src_vals[isrc]), bcf_float_set_vector_end(dst_vals[idst]), bcf_float_set_missing(src_vals[idst])); break; } #undef BRANCH_NUMERIC } @@ -2089,6 +2108,7 @@ static void usage(void) fprintf(bcftools_stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n"); fprintf(bcftools_stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n"); fprintf(bcftools_stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n"); + fprintf(bcftools_stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) 
allele when splitting multiallelics [0]\n"); fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n"); fprintf(bcftools_stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n"); fprintf(bcftools_stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n"); @@ -2128,6 +2148,7 @@ int main_vcfnorm(int argc, char *argv[]) args->buf_win = 1000; args->mrows_collapse = COLLAPSE_BOTH; args->do_indels = 1; + args->ma_use_ref_allele = 1; args->clevel = -1; int region_is_file = 0; int targets_is_file = 0; @@ -2146,6 +2167,7 @@ int main_vcfnorm(int argc, char *argv[]) {"fasta-ref",required_argument,NULL,'f'}, {"do-not-normalize",no_argument,NULL,'N'}, {"multiallelics",required_argument,NULL,'m'}, + {"multi-overlaps",required_argument,NULL,13}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"regions-overlap",required_argument,NULL,1}, @@ -2179,6 +2201,11 @@ int main_vcfnorm(int argc, char *argv[]) else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n"); break; case 12 : args->old_rec_tag = optarg; break; + case 13 : + if ( optarg[0]=='0' ) args->ma_use_ref_allele = 1; + else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0; + else error("Invalid argument to --multi-overlaps\n"); + break; case 'N': args->do_indels = 0; break; case 'd': if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS; diff --git a/bcftools/vcfquery.c b/bcftools/vcfquery.c index 70b5f30..889f363 100644 --- a/bcftools/vcfquery.c +++ b/bcftools/vcfquery.c @@ -125,6 +125,7 @@ static void query_vcf(args_t *args) } int i,max_convert_unpack = convert_max_unpack(args->convert); + int max_filter_unpack = args->filter ? 
filter_max_unpack(args->filter) : 0; while ( bcf_sr_next_line(args->files) ) { if ( !bcf_sr_has_line(args->files,0) ) continue; @@ -143,7 +144,7 @@ static void query_vcf(args_t *args) if ( pass ) { if ( !args->smpl_pass ) continue; - if ( !(max_convert_unpack & BCF_UN_FMT) ) continue; + if ( !(max_convert_unpack & BCF_UN_FMT) && !(max_filter_unpack & BCF_UN_FMT) ) continue; pass = 0; for (i=0; in_sample; i++) @@ -292,7 +293,7 @@ int main_vcfquery(int argc, char *argv[]) case 'f': args->format_str = strdup(optarg); break; case 'H': args->print_header = 1; break; case 'v': args->vcf_list = optarg; break; - case 'c': + case 'c': error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n"); break; case 'a': diff --git a/bcftools/vcfquery.c.pysam.c b/bcftools/vcfquery.c.pysam.c index 5ffbd64..f1e0f8b 100644 --- a/bcftools/vcfquery.c.pysam.c +++ b/bcftools/vcfquery.c.pysam.c @@ -127,6 +127,7 @@ static void query_vcf(args_t *args) } int i,max_convert_unpack = convert_max_unpack(args->convert); + int max_filter_unpack = args->filter ? 
filter_max_unpack(args->filter) : 0; while ( bcf_sr_next_line(args->files) ) { if ( !bcf_sr_has_line(args->files,0) ) continue; @@ -145,7 +146,7 @@ static void query_vcf(args_t *args) if ( pass ) { if ( !args->smpl_pass ) continue; - if ( !(max_convert_unpack & BCF_UN_FMT) ) continue; + if ( !(max_convert_unpack & BCF_UN_FMT) && !(max_filter_unpack & BCF_UN_FMT) ) continue; pass = 0; for (i=0; in_sample; i++) @@ -294,7 +295,7 @@ int main_vcfquery(int argc, char *argv[]) case 'f': args->format_str = strdup(optarg); break; case 'H': args->print_header = 1; break; case 'v': args->vcf_list = optarg; break; - case 'c': + case 'c': error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n"); break; case 'a': diff --git a/bcftools/vcfsort.c b/bcftools/vcfsort.c index a8052d0..1de2b28 100644 --- a/bcftools/vcfsort.c +++ b/bcftools/vcfsort.c @@ -1,19 +1,19 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2021 Genome Research Ltd. + Copyright (C) 2017-2022 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -77,7 +77,7 @@ void clean_files(args_t *args) unlink(blk->fname); free(blk->fname); } - if ( blk->rec ) + if ( blk->rec ) bcf_destroy(blk->rec); } rmdir(args->tmp_dir); @@ -107,7 +107,7 @@ int cmp_bcf_pos(const void *aptr, const void *bptr) int i; for (i=0; in_allele; i++) - { + { if ( i >= b->n_allele ) return 1; int ret = strcasecmp(a->d.allele[i],b->d.allele[i]); if ( ret ) return ret; @@ -124,6 +124,7 @@ void buf_flush(args_t *args) args->nblk++; args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk); + if ( !args->blk ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",sizeof(blk_t)*args->nblk); blk_t *blk = args->blk + args->nblk - 1; kstring_t str = {0,0,0}; @@ -135,7 +136,7 @@ void buf_flush(args_t *args) htsFile *fh = hts_open(blk->fname, "wbu"); if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno)); if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - + int i; for (i=0; inbuf; i++) { @@ -226,7 +227,7 @@ void buf_push(args_t *args, bcf1_t *rec) bcf_destroy(rec); } -void sort_blocks(args_t *args) +void sort_blocks(args_t *args) { htsFile *in = hts_open(args->fname, "r"); if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname); @@ -278,7 +279,7 @@ void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) khp_insert(blk, bhp, &blk); } -void merge_blocks(args_t *args) +void merge_blocks(args_t *args) { fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk); khp_blk_t *bhp = khp_init(blk); @@ -336,7 +337,7 @@ static void usage(args_t *args) exit(1); } -size_t parse_mem_string(const char *str) +size_t parse_mem_string(const char *str) { char *tmp; double mem = strtod(str, &tmp); @@ -352,6 +353,7 @@ static void init(args_t *args) { args->max_mem *= 0.9; args->mem_block = malloc(args->max_mem); + if ( !args->mem_block ) error("Error: could 
not allocate %zu bytes of memory, try reducing --max-mem\n",args->max_mem); args->mem = 0; args->tmp_dir = init_tmp_prefix(args->tmp_dir); diff --git a/bcftools/vcfsort.c.pysam.c b/bcftools/vcfsort.c.pysam.c index d3eb6b7..79dbc43 100644 --- a/bcftools/vcfsort.c.pysam.c +++ b/bcftools/vcfsort.c.pysam.c @@ -2,20 +2,20 @@ /* vcfsort.c -- sort subcommand - Copyright (C) 2017-2021 Genome Research Ltd. + Copyright (C) 2017-2022 Genome Research Ltd. Author: Petr Danecek - + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -79,7 +79,7 @@ void clean_files(args_t *args) unlink(blk->fname); free(blk->fname); } - if ( blk->rec ) + if ( blk->rec ) bcf_destroy(blk->rec); } rmdir(args->tmp_dir); @@ -109,7 +109,7 @@ int cmp_bcf_pos(const void *aptr, const void *bptr) int i; for (i=0; in_allele; i++) - { + { if ( i >= b->n_allele ) return 1; int ret = strcasecmp(a->d.allele[i],b->d.allele[i]); if ( ret ) return ret; @@ -126,6 +126,7 @@ void buf_flush(args_t *args) args->nblk++; args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk); + if ( !args->blk ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",sizeof(blk_t)*args->nblk); blk_t *blk = args->blk + args->nblk - 1; kstring_t str = {0,0,0}; @@ -137,7 +138,7 @@ void buf_flush(args_t *args) htsFile *fh = hts_open(blk->fname, "wbu"); if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno)); if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname); - + int i; for (i=0; inbuf; i++) { @@ -228,7 +229,7 @@ void buf_push(args_t *args, bcf1_t *rec) bcf_destroy(rec); } -void sort_blocks(args_t *args) +void sort_blocks(args_t *args) { htsFile *in = hts_open(args->fname, "r"); if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname); @@ -280,7 +281,7 @@ void blk_read(args_t *args, khp_blk_t *bhp, bcf_hdr_t *hdr, blk_t *blk) khp_insert(blk, bhp, &blk); } -void merge_blocks(args_t *args) +void merge_blocks(args_t *args) { fprintf(bcftools_stderr,"Merging %d temporary files\n", (int)args->nblk); khp_blk_t *bhp = khp_init(blk); @@ -338,7 +339,7 @@ static void usage(args_t *args) bcftools_exit(1); } -size_t parse_mem_string(const char *str) +size_t parse_mem_string(const char *str) { char *tmp; double mem = strtod(str, &tmp); @@ -354,6 +355,7 @@ static void init(args_t *args) { args->max_mem *= 0.9; args->mem_block = malloc(args->max_mem); + if ( !args->mem_block ) 
error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",args->max_mem); args->mem = 0; args->tmp_dir = init_tmp_prefix(args->tmp_dir); diff --git a/bcftools/vcfstats.c b/bcftools/vcfstats.c index 6a7272f..10189fe 100644 --- a/bcftools/vcfstats.c +++ b/bcftools/vcfstats.c @@ -1,6 +1,6 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -1881,6 +1881,7 @@ int main_vcfstats(int argc, char *argv[]) if ( args->split_by_id ) error("Only one file can be given with -i.\n"); } if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO; + else args->files->max_unpack = BCF_UN_FMT; if ( args->targets_list ) { bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); diff --git a/bcftools/vcfstats.c.pysam.c b/bcftools/vcfstats.c.pysam.c index 7b5c485..3b7da5a 100644 --- a/bcftools/vcfstats.c.pysam.c +++ b/bcftools/vcfstats.c.pysam.c @@ -2,7 +2,7 @@ /* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -1883,6 +1883,7 @@ int main_vcfstats(int argc, char *argv[]) if ( args->split_by_id ) error("Only one file can be given with -i.\n"); } if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO; + else args->files->max_unpack = BCF_UN_FMT; if ( args->targets_list ) { bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap); diff --git a/bcftools/vcfview.c b/bcftools/vcfview.c index cc02058..96dcbc7 100644 --- a/bcftools/vcfview.c +++ b/bcftools/vcfview.c @@ -1,6 +1,6 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. 
Author: Shane McCarthy @@ -512,7 +512,9 @@ static void usage(args_t *args) fprintf(stderr, "Subset options:\n"); fprintf(stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); fprintf(stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n"); + fprintf(stderr, " when combining filtering with sample subsetting as filtering comes (usually) first.\n"); + fprintf(stderr, " If unsure, split sample subsetting and filtering in two commands, using -Ou when piping.\n"); fprintf(stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n"); fprintf(stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(stderr, "\n"); @@ -623,7 +625,7 @@ int main_vcfview(int argc, char *argv[]) case 'l': args->clevel = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg); - args->output_type |= FT_GZ; + args->output_type |= FT_GZ; break; case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; @@ -649,7 +651,7 @@ int main_vcfview(int argc, char *argv[]) args->min_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg); break; - case 'M': + case 'M': args->max_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg); break; diff --git a/bcftools/vcfview.c.pysam.c b/bcftools/vcfview.c.pysam.c index 4bbbefb..85f483d 100644 --- a/bcftools/vcfview.c.pysam.c +++ b/bcftools/vcfview.c.pysam.c @@ -2,7 +2,7 @@ /* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files. 
- Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: Shane McCarthy @@ -514,7 +514,9 @@ static void usage(args_t *args) fprintf(bcftools_stderr, "Subset options:\n"); fprintf(bcftools_stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n"); fprintf(bcftools_stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n"); - fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n"); + fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n"); + fprintf(bcftools_stderr, " when combining filtering with sample subsetting as filtering comes (usually) first.\n"); + fprintf(bcftools_stderr, " If unsure, split sample subsetting and filtering in two commands, using -Ou when piping.\n"); fprintf(bcftools_stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n"); fprintf(bcftools_stderr, " --force-samples Only warn about unknown subset samples\n"); fprintf(bcftools_stderr, "\n"); @@ -625,7 +627,7 @@ int main_vcfview(int argc, char *argv[]) case 'l': args->clevel = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg); - args->output_type |= FT_GZ; + args->output_type |= FT_GZ; break; case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; @@ -651,7 +653,7 @@ int main_vcfview(int argc, char *argv[]) args->min_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg); break; - case 'M': + case 'M': args->max_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg); break; diff --git a/bcftools/version.sh b/bcftools/version.sh index 5fbf8df..55d8042 100755 --- 
a/bcftools/version.sh +++ b/bcftools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. # Master version, for use in tarballs or non-git source copies -VERSION=1.16 +VERSION=1.17 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/cy_build.py b/cy_build.py index 2726e94..59a6e12 100644 --- a/cy_build.py +++ b/cy_build.py @@ -8,7 +8,7 @@ except ImportError: from setuptools.command.build_ext import build_ext from distutils.extension import Extension -from distutils.sysconfig import get_config_vars, get_python_version +from distutils.sysconfig import get_config_var, get_config_vars, get_python_version from pkg_resources import Distribution @@ -66,7 +66,7 @@ class cy_build_ext(build_ext): # @loader_path. This will allow Python packages to find the library # in the expected place, while still giving enough flexibility to # external applications to link against the library. - relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"] + relative_module_path = ext.name.replace(".", os.sep) + (get_config_var('EXT_SUFFIX') or get_config_var('SO')) library_path = os.path.join( "@rpath", os.path.basename(relative_module_path) ) diff --git a/doc/index.rst b/doc/index.rst index 6bff551..30474e6 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -18,7 +18,7 @@ This module provides a low-level wrapper around the htslib_ C-API as using cython and a high-level, pythonic API for convenient access to the data within genomic file formats. -The current version wraps *htslib-1.16*, *samtools-1.16.1*, and *bcftools-1.16*. +The current version wraps *htslib-1.17*, *samtools-1.17*, and *bcftools-1.17*. 
To install the latest release, type:: diff --git a/linker_tests/link_pre_489/cy_build.py b/linker_tests/link_pre_489/cy_build.py index fae7055..d741d49 100644 --- a/linker_tests/link_pre_489/cy_build.py +++ b/linker_tests/link_pre_489/cy_build.py @@ -8,7 +8,7 @@ except ImportError: from setuptools.command.build_ext import build_ext from distutils.extension import Extension -from distutils.sysconfig import get_config_vars, get_python_lib, get_python_version +from distutils.sysconfig import get_config_var, get_config_vars, get_python_lib, get_python_version from pkg_resources import Distribution @@ -65,7 +65,7 @@ class cy_build_ext(build_ext): # @loader_path. This will allow Python packages to find the library # in the expected place, while still giving enough flexibility to # external applications to link against the library. - relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"] + relative_module_path = ext.name.replace(".", os.sep) + (get_config_var('EXT_SUFFIX') or get_config_var('SO')) library_path = os.path.join( "@rpath", os.path.basename(relative_module_path) ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..4106783 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[project] +name = "pysam" +description = "pysam - a python module for reading, manipulating and writing genomic data sets." 
+license = { text = "MIT License" } +version = "0.21.0" +authors = [ + { name = "Andreas Heger", email = "andreas.heger@gmail.com"} +] +requires-python = ">=3.6" + +dynamic = [ + "classifiers", + "readme", +] + +dependencies = [ + "cython", +] + + +[build-system] +requires = ["setuptools>=59.0", "wheel", "Cython>=0.29.30,<3.0"] +build-backend = "setuptools.build_meta:__legacy__" diff --git a/pysam/__init__.py b/pysam/__init__.py index ec52d94..9920f64 100644 --- a/pysam/__init__.py +++ b/pysam/__init__.py @@ -96,5 +96,5 @@ def get_libraries(): if pysam.config.HTSLIB == "builtin": pysam_libs.append('libchtslib') - so = sysconfig.get_config_var('SO') + so = sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO') return [os.path.join(dirname, x + so) for x in pysam_libs] diff --git a/pysam/bcftools.py b/pysam/bcftools.py index 4cbe82f..7f3c566 100644 --- a/pysam/bcftools.py +++ b/pysam/bcftools.py @@ -1,6 +1,12 @@ +try: + from typing import Final + HAVE_FINAL = True +except ImportError: + HAVE_FINAL = False + from pysam.utils import PysamDispatcher -BCFTOOLS_DISPATCH = [ +_BCFTOOLS_DISPATCH = [ "index", "annotate", "concat", @@ -24,6 +30,36 @@ BCFTOOLS_DISPATCH = [ "roh", "stats"] -# instantiate bcftools commands as python functions -for cmd in BCFTOOLS_DISPATCH: - globals()[cmd] = PysamDispatcher("bcftools", cmd, None) + +def _wrap_command(dispatch: str) -> PysamDispatcher: + return PysamDispatcher("bcftools", dispatch, ()) + + +if not HAVE_FINAL: + # instantiate bcftools commands as python functions + for cmd in _BCFTOOLS_DISPATCH: + globals()[cmd] = PysamDispatcher("bcftools", cmd, None) +else: + # python >=3.8 + index: Final[PysamDispatcher] = _wrap_command("index") + annotate: Final[PysamDispatcher] = _wrap_command("annotate") + concat: Final[PysamDispatcher] = _wrap_command("concat") + convert: Final[PysamDispatcher] = _wrap_command("convert") + isec: Final[PysamDispatcher] = _wrap_command("isec") + merge: Final[PysamDispatcher] = 
_wrap_command("merge") + norm: Final[PysamDispatcher] = _wrap_command("norm") + plugin: Final[PysamDispatcher] = _wrap_command("plugin") + query: Final[PysamDispatcher] = _wrap_command("query") + reheader: Final[PysamDispatcher] = _wrap_command("reheader") + sort: Final[PysamDispatcher] = _wrap_command("sort") + view: Final[PysamDispatcher] = _wrap_command("view") + head: Final[PysamDispatcher] = _wrap_command("head") + call: Final[PysamDispatcher] = _wrap_command("call") + consensus: Final[PysamDispatcher] = _wrap_command("consensus") + cnv: Final[PysamDispatcher] = _wrap_command("cnv") + csq: Final[PysamDispatcher] = _wrap_command("csq") + filter: Final[PysamDispatcher] = _wrap_command("filter") + gtcheck: Final[PysamDispatcher] = _wrap_command("gtcheck") + mpileup: Final[PysamDispatcher] = _wrap_command("mpileup") + roh: Final[PysamDispatcher] = _wrap_command("roh") + stats: Final[PysamDispatcher] = _wrap_command("stats") diff --git a/pysam/libcalignedsegment.pxd b/pysam/libcalignedsegment.pxd index 32e2c97..e14cbb1 100644 --- a/pysam/libcalignedsegment.pxd +++ b/pysam/libcalignedsegment.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 from pysam.libchtslib cimport * cdef extern from "htslib_util.h": diff --git a/pysam/libcalignedsegment.pyx b/pysam/libcalignedsegment.pyx index 810a861..75b5ee9 100644 --- a/pysam/libcalignedsegment.pyx +++ b/pysam/libcalignedsegment.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 # cython: embedsignature=True # cython: profile=True ############################################################################### @@ -62,7 +63,6 @@ import struct cimport cython from cpython cimport array as c_array -from cpython.version cimport PY_MAJOR_VERSION from cpython cimport PyBytes_FromStringAndSize from libc.string cimport memset, strchr from cpython cimport array as c_array @@ -80,21 +80,13 @@ from pysam.libcutils cimport qualities_to_qualitystring, qualitystring_to_array, cdef char * htslib_types = 'cCsSiIf' cdef char * parray_types = 
'bBhHiIf' -cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 - # translation tables # cigar code to character and vice versa cdef char* CODE2CIGAR= "MIDNSHP=XB" cdef int NCIGAR_CODES = 10 -if IS_PYTHON3: - CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) - maketrans = str.maketrans -else: - CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR)) - maketrans = string.maketrans - +CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR)) CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])") # names for keys in dictionary representation of an AlignedSegment @@ -122,11 +114,11 @@ cdef inline uint8_t toupper(uint8_t ch): cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b): - if ch == '=': + if ch == b'=': if bam_is_rev(b): - return ',' + return b',' else: - return '.' + return b'.' else: if bam_is_rev(b): return tolower(ch) @@ -228,29 +220,29 @@ cdef inline uint8_t get_tag_typecode(value, value_type=None): if isinstance(value, int): if value < 0: if value >= INT8_MIN: - typecode = 'c' + typecode = b'c' elif value >= INT16_MIN: - typecode = 's' + typecode = b's' elif value >= INT32_MIN: - typecode = 'i' + typecode = b'i' # unsigned ints else: if value <= UINT8_MAX: - typecode = 'C' + typecode = b'C' elif value <= UINT16_MAX: - typecode = 'S' + typecode = b'S' elif value <= UINT32_MAX: - typecode = 'I' + typecode = b'I' elif isinstance(value, float): - typecode = 'f' + typecode = b'f' elif isinstance(value, str): - typecode = 'Z' + typecode = b'Z' elif isinstance(value, bytes): - typecode = 'Z' + typecode = b'Z' elif isinstance(value, array.array) or \ isinstance(value, list) or \ isinstance(value, tuple): - typecode = 'B' + typecode = b'B' else: if value_type in 'aAsSIcCZidfH': typecode = force_bytes(value_type)[0] @@ -275,7 +267,7 @@ cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): t = type(value) if t is float: - typecode = 'f' + typecode = b'f' elif t is int: if max_value is None: max_value = value @@ -284,11 +276,11 @@ 
cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): # signed ints if min_value < 0: if min_value >= INT8_MIN and max_value <= INT8_MAX: - typecode = 'c' + typecode = b'c' elif min_value >= INT16_MIN and max_value <= INT16_MAX: - typecode = 's' + typecode = b's' elif min_value >= INT32_MIN or max_value <= INT32_MAX: - typecode = 'i' + typecode = b'i' else: raise ValueError( "at least one signed integer out of range of " @@ -296,11 +288,11 @@ cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): # unsigned ints else: if max_value <= UINT8_MAX: - typecode = 'C' + typecode = b'C' elif max_value <= UINT16_MAX: - typecode = 'S' + typecode = b'S' elif max_value <= UINT32_MAX: - typecode = 'I' + typecode = b'I' else: raise ValueError( "at least one integer out of range of BAM/SAM specification") @@ -309,9 +301,9 @@ cdef inline uint8_t get_btag_typecode(value, min_value=None, max_value=None): if t is not bytes: value = value.encode('ascii') if len(value) == 1: - typecode = 'A' + typecode = b'A' else: - typecode = 'Z' + typecode = b'Z' return typecode @@ -357,10 +349,7 @@ cdef inline pack_tags(tags): typecode = 0 else: # only first character in valuecode matters - if IS_PYTHON3: - typecode = force_bytes(valuetype)[0] - else: - typecode = ord(valuetype[0]) + typecode = force_bytes(valuetype)[0] pytag = force_bytes(pytag) pytype = type(value) @@ -398,18 +387,11 @@ cdef inline pack_tags(tags): # use array.tostring() to retrieve byte representation and # save as bytes datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1]) - if IS_PYTHON3: - args.extend([pytag[:2], - ord("B"), - typecode, - len(value), - value.tobytes()]) - else: - args.extend([pytag[:2], - ord("B"), - typecode, - len(value), - force_bytes(value.tostring())]) + args.extend([pytag[:2], + ord("B"), + typecode, + len(value), + value.tobytes()]) else: if typecode == 0: @@ -417,13 +399,13 @@ cdef inline pack_tags(tags): if typecode == 0: raise 
ValueError("could not deduce typecode for value {}".format(value)) - if typecode == 'a' or typecode == 'A' or typecode == 'Z' or typecode == 'H': + if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H': value = force_bytes(value) - if typecode == "a": - typecode = 'A' + if typecode == b"a": + typecode = b'A' - if typecode == 'Z' or typecode == 'H': + if typecode == b'Z' or typecode == b'H': datafmt = "2sB%is" % (len(value)+1) else: datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0] @@ -567,7 +549,7 @@ cdef inline bytes getSequenceInRange(bam1_t *src, for k from start <= k < end: # equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c) # note: do not use string literal as it will be a python string - s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf] + s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf] return charptr_to_bytes(seq) @@ -683,7 +665,7 @@ cdef inline uint32_t get_md_reference_length(char * md_tag): else: l += nmatches nmatches = 0 - if md_tag[md_idx] == '^': + if md_tag[md_idx] == b'^': md_idx += 1 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: md_idx += 1 @@ -760,7 +742,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): s_idx += 1 elif op == BAM_CDEL: for i from 0 <= i < l: - s[s_idx] = '-' + s[s_idx] = b'-' s_idx += 1 elif op == BAM_CREF_SKIP: pass @@ -786,7 +768,7 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): cdef int insertions = 0 while s[s_idx] != 0: - if s[s_idx] >= 'a': + if s[s_idx] >= b'a': insertions += 1 s_idx += 1 s_idx = 0 @@ -808,15 +790,15 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): else: # save matches up to this point, skipping insertions for x from 0 <= x < nmatches: - while s[s_idx] >= 'a': + while s[s_idx] >= b'a': s_idx += 1 s_idx += 1 - while s[s_idx] >= 'a': + while s[s_idx] >= b'a': s_idx += 1 r_idx += nmatches nmatches = 0 - if md_tag[md_idx] == '^': + if md_tag[md_idx] == b'^': md_idx += 1 while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90: 
# assert s[s_idx] == '-' @@ -836,10 +818,10 @@ cdef inline bytes build_alignment_sequence(bam1_t * src): # save matches up to this point, skipping insertions for x from 0 <= x < nmatches: - while s[s_idx] >= 'a': + while s[s_idx] >= b'a': s_idx += 1 s_idx += 1 - while s[s_idx] >= 'a': + while s[s_idx] >= b'a': s_idx += 1 seq = PyBytes_FromStringAndSize(s, s_idx) @@ -1198,7 +1180,7 @@ cdef class AlignedSegment: cdef uint16_t x = 0 for x from l <= x < l + l_extranul: - p[x] = '\0' + p[x] = b'\0' property flag: """properties flag""" @@ -1423,8 +1405,8 @@ cdef class AlignedSegment: # as the sequence is stored in half-bytes, the total length (sequence # plus quality scores) is (l+1)/2 + l - nbytes_new = (l + 1) / 2 + l - nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq + nbytes_new = (l + 1) // 2 + l + nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq # acquire pointer to location in memory p = pysam_bam_get_seq(src) @@ -1448,7 +1430,7 @@ cdef class AlignedSegment: # convert to C string s = seq for k from 0 <= k < l: - p[k/2] |= seq_nt16_table[s[k]] << 4 * (1 - k % 2) + p[k // 2] |= seq_nt16_table[s[k]] << 4 * (1 - k % 2) # erase qualities p = pysam_bam_get_qual(src) @@ -1956,7 +1938,7 @@ cdef class AlignedSegment: return None s = force_str(self.query_sequence) if self.is_reverse: - s = s.translate(maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1] + s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1] return s def get_forward_qualities(self): @@ -2414,54 +2396,54 @@ cdef class AlignedSegment: value, value_type)) # sam_format1 for typecasting - if typecode == 'Z': + if typecode == b'Z': value = force_bytes(value) value_ptr = value value_size = len(value)+1 - elif typecode == 'H': + elif typecode == b'H': # Note that hex tags are stored the very same # way as Z string.s value = force_bytes(value) value_ptr = value value_size = len(value)+1 - elif typecode == 'A' or typecode == 'a': + elif typecode == b'A' or typecode == b'a': value = 
force_bytes(value) value_ptr = value value_size = sizeof(char) - typecode = 'A' - elif typecode == 'i': + typecode = b'A' + elif typecode == b'i': int32_t_value = value value_ptr = &int32_t_value value_size = sizeof(int32_t) - elif typecode == 'I': + elif typecode == b'I': uint32_t_value = value value_ptr = &uint32_t_value value_size = sizeof(uint32_t) - elif typecode == 's': + elif typecode == b's': int16_t_value = value value_ptr = &int16_t_value value_size = sizeof(int16_t) - elif typecode == 'S': + elif typecode == b'S': uint16_t_value = value value_ptr = &uint16_t_value value_size = sizeof(uint16_t) - elif typecode == 'c': + elif typecode == b'c': int8_t_value = value value_ptr = &int8_t_value value_size = sizeof(int8_t) - elif typecode == 'C': + elif typecode == b'C': uint8_t_value = value value_ptr = &uint8_t_value value_size = sizeof(uint8_t) - elif typecode == 'd': + elif typecode == b'd': double_value = value value_ptr = &double_value value_size = sizeof(double) - elif typecode == 'f': + elif typecode == b'f': float_value = value value_ptr = &float_value value_size = sizeof(float) - elif typecode == 'B': + elif typecode == b'B': # the following goes through python, needs to be cleaned up # pack array using struct fmt, args = pack_tags([(tag, value, value_type)]) @@ -2553,7 +2535,7 @@ cdef class AlignedSegment: value = bam_aux2f(v) elif auxtype == 'A' or auxtype == 'a': # force A to a - v[0] = 'A' + v[0] = b'A' # there might a more efficient way # to convert a char into a string value = '%c' % bam_aux2A(v) @@ -2609,29 +2591,29 @@ cdef class AlignedSegment: auxtag[1] = s[1] s += 2 auxtype = s[0] - if auxtype in ('c', 'C'): + if auxtype in (b'c', b'C'): value = bam_aux2i(s) s += 1 - elif auxtype in ('s', 'S'): + elif auxtype in (b's', b'S'): value = bam_aux2i(s) s += 2 - elif auxtype in ('i', 'I'): + elif auxtype in (b'i', b'I'): value = bam_aux2i(s) s += 4 - elif auxtype == 'f': + elif auxtype == b'f': value = bam_aux2f(s) s += 4 - elif auxtype == 'd': + 
elif auxtype == b'd': value = bam_aux2f(s) s += 8 - elif auxtype in ('A', 'a'): + elif auxtype in (b'A', b'a'): value = "%c" % bam_aux2A(s) s += 1 - elif auxtype in ('Z', 'H'): + elif auxtype in (b'Z', b'H'): value = charptr_to_str(bam_aux2Z(s)) # +1 for NULL terminated string s += len(value) + 1 - elif auxtype == 'B': + elif auxtype == b'B': s += 1 byte_size, nvalues, value = convert_binary_tag(s) # 5 for 1 char and 1 int @@ -3094,7 +3076,7 @@ cdef class PileupColumn: continue # see samtools pileup_seq if mark_ends and p.is_head: - kputc('^', buf) + kputc(b'^', buf) if p.b.core.qual > 93: kputc(126, buf) @@ -3104,42 +3086,42 @@ cdef class PileupColumn: if p.qpos < p.b.core.l_qseq: cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)] else: - cc = 'N' + cc = b'N' if mark_matches and self.reference_sequence != NULL: rb = self.reference_sequence[self.reference_pos] if seq_nt16_table[cc] == seq_nt16_table[rb]: - cc = "=" + cc = b'=' kputc(strand_mark_char(cc, p.b), buf) elif add_indels: if p.is_refskip: if bam_is_rev(p.b): - kputc('<', buf) + kputc(b'<', buf) else: - kputc('>', buf) + kputc(b'>', buf) else: - kputc('*', buf) + kputc(b'*', buf) if add_indels: if p.indel > 0: - kputc('+', buf) + kputc(b'+', buf) kputw(p.indel, buf) for j from 1 <= j <= p.indel: cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)] kputc(strand_mark_char(cc, p.b), buf) elif p.indel < 0: - kputc('-', buf) + kputc(b'-', buf) kputw(-p.indel, buf) for j from 1 <= j <= -p.indel: # TODO: out-of-range check here? 
if self.reference_sequence == NULL: - cc = 'N' + cc = b'N' else: cc = self.reference_sequence[self.reference_pos + j] kputc(strand_mark_char(cc, p.b), buf) if mark_ends and p.is_tail: - kputc('$', buf) + kputc(b'$', buf) - kputc(':', buf) + kputc(b':', buf) if buf.l == 0: # could be zero if all qualities are too low diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index 2a17fbe..cd0ebf8 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 75c1fa4..74637f8 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -69,7 +69,7 @@ class AlignmentHeader: def get_reference_name(self, tid: int) -> Optional[str]: ... def get_reference_length(self, reference: int) -> int: ... def is_valid_tid(self, tid: int) -> bool: ... - def get_tid(self, reference: int) -> int: ... + def get_tid(self, reference: str) -> int: ... class AlignmentFile(HTSFile): def __init__( @@ -191,7 +191,7 @@ class AlignmentFile(HTSFile): class IteratorRow: def __iter__(self) -> IteratorRow: ... - def __next__(self) -> PileupColumn: ... + def __next__(self) -> AlignedSegment: ... class IteratorRowAll(IteratorRow): ... class IteratorRowAllRefs(IteratorRow): ... 
diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 799258a..e37a411 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -69,17 +69,13 @@ from libc.string cimport strcmp, strpbrk, strerror from libc.stdint cimport INT32_MAX from cpython cimport array as c_array -from cpython.version cimport PY_MAJOR_VERSION from pysam.libcutils cimport force_bytes, force_str, charptr_to_str from pysam.libcutils cimport encode_filename, from_string_and_size from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn from pysam.libchtslib cimport HTSFile, hisremote -if PY_MAJOR_VERSION >= 3: - from io import StringIO -else: - from StringIO import StringIO +from io import StringIO cimport cython @@ -1649,8 +1645,11 @@ cdef class AlignmentFile(HTSFile): match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position for r in read_iterator: base_position = r.pos + cigar = r.cigartuples + if cigar is None: + continue - for op, nt in r.cigartuples: + for op, nt in cigar: if op in match_or_deletion: base_position += nt elif op == BAM_CREF_SKIP: @@ -1666,13 +1665,13 @@ cdef class AlignmentFile(HTSFile): if self.htsfile == NULL: return - cdef int ret = hts_close(self.htsfile) - self.htsfile = NULL - if self.index != NULL: hts_idx_destroy(self.index) self.index = NULL + cdef int ret = hts_close(self.htsfile) + self.htsfile = NULL + self.header = None if ret < 0: @@ -1685,14 +1684,14 @@ cdef class AlignmentFile(HTSFile): def __dealloc__(self): cdef int ret = 0 - if self.htsfile != NULL: - ret = hts_close(self.htsfile) - self.htsfile = NULL - if self.index != NULL: hts_idx_destroy(self.index) self.index = NULL + if self.htsfile != NULL: + ret = hts_close(self.htsfile) + self.htsfile = NULL + self.header = None if self.b: @@ -2047,8 +2046,8 @@ cdef class IteratorRow: def __dealloc__(self): bam_destroy1(self.b) if self.owns_samfile: - hts_close(self.htsfile) hts_idx_destroy(self.index) + 
hts_close(self.htsfile) cdef class IteratorRowRegion(IteratorRow): diff --git a/pysam/libcbcf.pxd b/pysam/libcbcf.pxd index 6508994..0cd54a5 100644 --- a/pysam/libcbcf.pxd +++ b/pysam/libcbcf.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 ############################################################################### ############################################################################### ## Cython wrapper for htslib VCF/BCF reader/writer diff --git a/pysam/libcbcf.pyx b/pysam/libcbcf.pyx index c3cf8cf..8c088af 100644 --- a/pysam/libcbcf.pyx +++ b/pysam/libcbcf.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 # cython: embedsignature=True # cython: profile=True ############################################################################### @@ -96,7 +97,6 @@ from cpython.dict cimport PyDict_GetItemString, PyDict_SetItemString from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM from cpython.bytes cimport PyBytes_FromStringAndSize from cpython.unicode cimport PyUnicode_DecodeUTF8 -from cpython.version cimport PY_MAJOR_VERSION from pysam.libchtslib cimport HTSFile, hisremote @@ -168,10 +168,7 @@ cdef inline bcf_str_cache_get_charptr(const char* s): if pystr: return pystr - if PY_MAJOR_VERSION < 3: - val = s - else: - val = PyUnicode_DecodeUTF8(s, strlen(s), NULL) + val = PyUnicode_DecodeUTF8(s, strlen(s), NULL) PyDict_SetItemString(bcf_str_cache, s, val) @@ -2094,14 +2091,15 @@ cdef class VariantHeader(object): if contig is not None: rec.contig = contig - if alleles is not None: - rec.alleles = alleles rec.start = start rec.stop = stop rec.id = id rec.qual = qual + if alleles is not None: + rec.alleles = alleles + if filter is not None: if isinstance(filter, (list, tuple, VariantRecordFilter)): for f in filter: diff --git a/pysam/libcbcftools.pxd b/pysam/libcbcftools.pxd index f8892ed..613353e 100644 --- a/pysam/libcbcftools.pxd +++ b/pysam/libcbcftools.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 cdef extern from "bcftools.pysam.h": int 
bcftools_dispatch(int argc, char *argv[]) diff --git a/pysam/libcbgzf.pyx b/pysam/libcbgzf.pyx index ede6463..0d88f8d 100644 --- a/pysam/libcbgzf.pyx +++ b/pysam/libcbgzf.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 """Functions that read and write block gzipped files. The user of the file doesn't have to worry about the compression @@ -213,7 +214,7 @@ cdef class BGZFile(object): line.l = line.m = 0 line.s = NULL - cdef int ret = bgzf_getline(self.bgzf, '\n', &line) + cdef int ret = bgzf_getline(self.bgzf, b'\n', &line) if ret == -1: s = b'' elif ret == -2: diff --git a/pysam/libcfaidx.pxd b/pysam/libcfaidx.pxd index 53ad767..c17d0ba 100644 --- a/pysam/libcfaidx.pxd +++ b/pysam/libcfaidx.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free diff --git a/pysam/libcfaidx.pyi b/pysam/libcfaidx.pyi index 5865701..232eeb6 100644 --- a/pysam/libcfaidx.pyi +++ b/pysam/libcfaidx.pyi @@ -34,11 +34,25 @@ class FastaFile: def __getitem__(self, reference: str) -> str: ... def __contains__(self, reference: str) -> bool: ... + +class FastqProxy: + @property + def name(self) -> str: ... + @property + def sequence(self) -> str: ... + @property + def comment(self) -> Optional[str]: ... + @property + def quality(self) -> Optional[str]: ... + def to_string(self) -> str: ... + def get_quality_array(self, offset: int = ...) -> Optional[array.array]: ... + + class FastxRecord: - comment: str = ... - quality: str = ... - sequence: str = ... - name: str = ... + comment: Optional[str] = ... + quality: Optional[str] = ... + sequence: Optional[str] = ... + name: Optional[str] = ... 
def __init__( self, name: Optional[str] = ..., diff --git a/pysam/libcfaidx.pyx b/pysam/libcfaidx.pyx index e73adf9..d4e7427 100644 --- a/pysam/libcfaidx.pyx +++ b/pysam/libcfaidx.pyx @@ -60,8 +60,6 @@ from cpython cimport PyErr_SetString, \ PyUnicode_Check, \ PyBytes_FromStringAndSize -from cpython.version cimport PY_MAJOR_VERSION - from pysam.libchtslib cimport \ faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \ faidx_seq_len, faidx_iseq, faidx_seq_len, \ diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index ed3ca92..30a1b76 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free diff --git a/pysam/libchtslib.pyx b/pysam/libchtslib.pyx index 3a9bbd2..760d268 100644 --- a/pysam/libchtslib.pyx +++ b/pysam/libchtslib.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 # cython: embedsignature=True # cython: profile=True # adds doc-strings for sphinx diff --git a/pysam/libcsamfile.pxd b/pysam/libcsamfile.pxd index dff1345..b9e9014 100644 --- a/pysam/libcsamfile.pxd +++ b/pysam/libcsamfile.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 from pysam.libcalignmentfile cimport AlignedSegment, AlignmentFile ################################################# diff --git a/pysam/libcsamfile.pyx b/pysam/libcsamfile.pyx index bde93d8..e5c412f 100644 --- a/pysam/libcsamfile.pyx +++ b/pysam/libcsamfile.pyx @@ -17,8 +17,6 @@ from cpython cimport PyErr_SetString, \ PyUnicode_Check, \ PyBytes_FromStringAndSize -from cpython.version cimport PY_MAJOR_VERSION - from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment diff --git a/pysam/libcsamtools.pxd b/pysam/libcsamtools.pxd index 628d9a5..3d42be6 100644 --- a/pysam/libcsamtools.pxd +++ b/pysam/libcsamtools.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 cdef extern from 
"samtools.pysam.h": int samtools_dispatch(int argc, char *argv[]) diff --git a/pysam/libctabix.pxd b/pysam/libctabix.pxd index c986f03..174dd8b 100644 --- a/pysam/libctabix.pxd +++ b/pysam/libctabix.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 from libc.stdint cimport int8_t, int16_t, int32_t, int64_t from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libc.stdlib cimport malloc, calloc, realloc, free diff --git a/pysam/libctabix.pyx b/pysam/libctabix.pyx index 4436420..97e3403 100644 --- a/pysam/libctabix.pyx +++ b/pysam/libctabix.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 # cython: embedsignature=True # cython: profile=True ############################################################################### @@ -65,8 +66,6 @@ from cpython cimport PyErr_SetString, PyBytes_Check, \ PyUnicode_Check, PyBytes_FromStringAndSize, \ PyObject_AsFileDescriptor -from cpython.version cimport PY_MAJOR_VERSION - cimport pysam.libctabixproxies as ctabixproxies from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\ @@ -654,7 +653,7 @@ cdef class TabixIterator: if retval < 0: break - if self.buffer.s[0] != '#': + if self.buffer.s[0] != b'#': break return retval @@ -673,9 +672,6 @@ cdef class TabixIterator: return charptr_to_str(self.buffer.s, self.encoding) - def next(self): - return self.__next__() - def __dealloc__(self): if self.iterator != NULL: tbx_itr_destroy(self.iterator) @@ -689,9 +685,6 @@ class EmptyIterator: def __iter__(self): return self - def next(self): - raise StopIteration() - def __next__(self): raise StopIteration() @@ -764,7 +757,7 @@ cdef class GZIterator: cdef int retval = 0 while 1: with nogil: - retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret) + retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) if retval < 0: break @@ -792,7 +785,7 @@ cdef class GZIteratorHead(GZIterator): cdef int retval = self.__cnext__() if retval < 0: raise StopIteration - if self.buffer.s[0] == '#': + if 
self.buffer.s[0] == b'#': return self.buffer.s else: raise StopIteration @@ -1152,7 +1145,7 @@ cdef class tabix_file_iterator: cdef int retval = 0 while 1: with nogil: - retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret) + retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret) if retval < 0: break @@ -1161,11 +1154,11 @@ cdef class tabix_file_iterator: b = self.buffer.s # skip comments - if (b[0] == '#'): + if (b[0] == b'#'): continue # skip empty lines - if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': + if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r': continue # gzgets terminates at \n, no need to test @@ -1183,9 +1176,6 @@ cdef class tabix_file_iterator: def __next__(self): return self.__cnext__() - def next(self): - return self.__cnext__() - class tabix_generic_iterator: '''iterate over ``infile``. @@ -1225,18 +1215,18 @@ class tabix_generic_iterator: s = force_bytes(line, encoding) b = s nbytes = len(line) - assert b[nbytes] == '\0' + assert b[nbytes] == b'\0' # skip comments - if b[0] == '#': + if b[0] == b'#': continue # skip empty lines - if b[0] == '\0' or b[0] == '\n' or b[0] == '\r': + if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r': continue # make sure that entry is complete - if b[nbytes-1] != '\n' and b[nbytes-1] != '\r': + if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r': raise ValueError("incomplete line at %s" % line) bytes_cpy = b @@ -1246,10 +1236,6 @@ class tabix_generic_iterator: raise StopIteration - # python version - required for python 2.7 - def next(self): - return self.__next__() - def tabix_iterator(infile, parser): """return an iterator over all entries in a file. @@ -1261,23 +1247,8 @@ def tabix_iterator(infile, parser): :class:`~pysam.asGTF`). 
""" - if PY_MAJOR_VERSION >= 3: - return tabix_generic_iterator(infile, parser) - else: - return tabix_file_iterator(infile, parser) + return tabix_generic_iterator(infile, parser) - # file objects can use C stdio - # used to be: isinstance( infile, file): - # if PY_MAJOR_VERSION >= 3: - # if isinstance( infile, io.IOBase ): - # return tabix_copy_iterator( infile, parser ) - # else: - # return tabix_generic_iterator( infile, parser ) - # else: -# if isinstance( infile, file ): -# return tabix_copy_iterator( infile, parser ) -# else: -# return tabix_generic_iterator( infile, parser ) cdef class Tabixfile(TabixFile): """Tabixfile is deprecated: use TabixFile instead""" diff --git a/pysam/libctabixproxies.pxd b/pysam/libctabixproxies.pxd index 907b40d..5be5541 100644 --- a/pysam/libctabixproxies.pxd +++ b/pysam/libctabixproxies.pxd @@ -1,5 +1,4 @@ -#cdef extern from "Python.h": -# ctypedef struct FILE +# cython: language_level=3 from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t diff --git a/pysam/libctabixproxies.pyx b/pysam/libctabixproxies.pyx index 10b3e5a..9aebf0b 100644 --- a/pysam/libctabixproxies.pyx +++ b/pysam/libctabixproxies.pyx @@ -128,8 +128,8 @@ cdef class TupleProxy: if reset: for x from 0 <= x < nbytes: - if self.data[x] == '\0': - self.data[x] = '\t' + if self.data[x] == b'\0': + self.data[x] = b'\t' self.update(self.data, nbytes) @@ -175,8 +175,8 @@ cdef class TupleProxy: ################################# # remove line breaks and feeds and update number of bytes x = nbytes - 1 - while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'): - buffer[x] = '\0' + while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'): + buffer[x] = b'\0' x -= 1 self.nbytes = x + 1 @@ -198,7 +198,7 @@ cdef class TupleProxy: # to guess or dynamically grow if max_fields == 0: for x from 0 <= x < nbytes: - if buffer[x] == '\t': + if buffer[x] == b'\t': max_fields += 1 max_fields += 1 @@ -214,7 +214,7 @@ cdef class TupleProxy: old_pos = pos while 1: - 
pos = memchr(pos, '\t', nbytes) + pos = memchr(pos, b'\t', nbytes) if pos == NULL: break if field >= max_fields: @@ -222,7 +222,7 @@ cdef class TupleProxy: "parsing error: more than %i fields in line: %s" % (max_fields, buffer)) - pos[0] = '\0' + pos[0] = b'\0' pos += 1 self.fields[field] = pos field += 1 @@ -318,8 +318,8 @@ cdef class TupleProxy: raise ValueError("out of memory") memcpy(cpy, self.data, self.nbytes+1) for x from 0 <= x < self.nbytes: - if cpy[x] == '\0': - cpy[x] = '\t' + if cpy[x] == b'\0': + cpy[x] = b'\t' result = cpy[:self.nbytes] free(cpy) r = result.decode(self.encoding) diff --git a/pysam/libcutils.pxd b/pysam/libcutils.pxd index d78b706..de7f115 100644 --- a/pysam/libcutils.pxd +++ b/pysam/libcutils.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 ######################################################################### # Utility functions used across pysam ######################################################################### diff --git a/pysam/libcutils.pyx b/pysam/libcutils.pyx index 81a19d3..246c835 100644 --- a/pysam/libcutils.pyx +++ b/pysam/libcutils.pyx @@ -1,3 +1,4 @@ +# cython: language_level=3 import types import sys import string @@ -50,10 +51,7 @@ cpdef array_to_qualitystring(c_array.array qualities, int offset=33): for x from 0 <= x < len(qualities): result[x] = qualities[x] + offset - if IS_PYTHON3: - return force_str(result.tobytes()) - else: - return result.tostring() + return force_str(result.tobytes()) cpdef qualities_to_qualitystring(qualities, int offset=33): @@ -108,14 +106,8 @@ cpdef set_encoding_error_handler(name): ## Python 3 compatibility functions ######################################################################## -cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3 - cdef from_string_and_size(const char* s, size_t length): - if IS_PYTHON3: - return s[:length].decode('utf-8', ERROR_HANDLER) - else: - return s[:length] - + return s[:length].decode('utf-8', ERROR_HANDLER) # filename encoding (adapted from 
lxml.etree.pyx) cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii' @@ -125,15 +117,7 @@ cdef bytes encode_filename(object filename): """Make sure a filename is 8-bit encoded (or None).""" if filename is None: return None - elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2: - # Added to support path-like objects - return os.fsencode(filename) - elif PyBytes_Check(filename): - return filename - elif PyUnicode_Check(filename): - return filename.encode(FILENAME_ENCODING) - else: - raise TypeError("Argument must be string or unicode.") + return os.fsencode(filename) cdef bytes force_bytes(object s, encoding=None, errors=None): @@ -153,19 +137,13 @@ cdef bytes force_bytes(object s, encoding=None, errors=None): cdef charptr_to_str(const char* s, encoding=None, errors=None): if s == NULL: return None - if PY_MAJOR_VERSION < 3: - return s - else: - return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) + return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None): if s == NULL: return None - if PY_MAJOR_VERSION < 3: - return s[:n] - else: - return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) + return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None): @@ -180,13 +158,10 @@ cdef force_str(object s, encoding=None, errors=None): (bytes in Py2, unicode in Py3)""" if s is None: return None - if PY_MAJOR_VERSION < 3: - return s - elif PyBytes_Check(s): + if PyBytes_Check(s): return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER) - else: - # assume unicode - return s + # assume unicode + return s cdef decode_bytes(bytes s, encoding=None, errors=None): @@ -305,19 +280,18 @@ def _pysam_dispatch(collection, ''' if method == "index" and args: - # We make sure that at least 1 input file exists, + # We make sure that at least 
the first specified input file exists, # and if it doesn't we raise an IOError. - SIMPLE_FLAGS = ['-c', '--csi', '-f', '--force', '-t', '--tbi', '-n', '--nstats', '-s', '--stats'] - ARGUMENTS = ['-m', '--min-shift', '-o', '--output-file', '--threads', '-@'] + ARGUMENTS = ['-m', '--min-shift', '-o', '--output', '--output-file', '-@', '--threads'] skip_next = False for arg in args: if skip_next: skip_next = False continue - if arg in SIMPLE_FLAGS or (len(arg) > 2 and force_bytes(arg).startswith(b'-@')): - continue - if arg in ARGUMENTS: - skip_next = True + if arg.startswith('-'): + # Skip next argument for e.g. '--min-shift' '12' or '-m' '12' but not '-m12' + if arg in ARGUMENTS: + skip_next = True continue if not os.path.exists(arg): raise IOError("No such file or directory: '%s'" % arg) diff --git a/pysam/libcvcf.pxd b/pysam/libcvcf.pxd index 139597f..7a635b1 100644 --- a/pysam/libcvcf.pxd +++ b/pysam/libcvcf.pxd @@ -1,2 +1,2 @@ - +# cython: language_level=3 diff --git a/pysam/samtools.py b/pysam/samtools.py index a90d32c..046658f 100644 --- a/pysam/samtools.py +++ b/pysam/samtools.py @@ -1,55 +1,197 @@ +import platform +from typing import ( + Callable, + List, + Tuple, + Iterable, + Union, +) +try: + from typing import Final + HAVE_FINAL = True +except ImportError: + HAVE_FINAL = False + from pysam.utils import PysamDispatcher + # samtools command line options to export in python -SAMTOOLS_DISPATCH = { +_SAMTOOLS_DISPATCH = { # samtools 'documented' commands - "view": ("view", None), - "head": ("head", None), - "sort": ("sort", None), - "mpileup": ("mpileup", None), - "consensus": ("consensus", None), - "depth": ("depth", None), - "faidx": ("faidx", None), - "fqidx": ("fqidx", None), - "tview": ("tview", None), - "index": ("index", None), - "idxstats": ("idxstats", None), - "fixmate": ("fixmate", None), - "flagstat": ("flagstat", None), - "calmd": ("calmd", None), - "merge": ("merge", None), - "markdup": ("markdup", None), - "rmdup": ("rmdup", None), - 
"reference": ("reference", None), - "reheader": ("reheader", None), - "cat": ("cat", None), - "targetcut": ("targetcut", None), - "phase": ("phase", None), - "bam2fq": ("bam2fq", None), - "dict": ("dict", None), - "addreplacerg": ("addreplacerg", None), - "pad2unpad": ("pad2unpad", None), - "depad": ("pad2unpad", None), - "bedcov": ("bedcov", None), - "coverage": ("coverage", None), - "bamshuf": ("bamshuf", None), - "collate": ("collate", None), - "stats": ("stats", None), - "fasta": ("fasta", None), - "fastq": ("fastq", None), - "quickcheck": ("quickcheck", None), - "split": ("split", None), - "flags": ("flags", None), - "ampliconclip": ("ampliconclip", None), - "ampliconstats": ("ampliconstats", None), - "version": ("version", None), - "fqimport": ("import", None), - "samples": ("samples", None), + "view": ("view", ()), + "head": ("head", ()), + "sort": ("sort", ()), + "mpileup": ("mpileup", ()), + "consensus": ("consensus", ()), + "depth": ("depth", ()), + "faidx": ("faidx", ()), + "fqidx": ("fqidx", ()), + "tview": ("tview", ()), + "index": ("index", ()), + "idxstats": ("idxstats", ()), + "fixmate": ("fixmate", ()), + "flagstat": ("flagstat", ()), + "calmd": ("calmd", ()), + "merge": ("merge", ()), + "markdup": ("markdup", ()), + "rmdup": ("rmdup", ()), + "reference": ("reference", ()), + "reheader": ("reheader", ()), + "reset": ("reset", ()), + "cat": ("cat", ()), + "targetcut": ("targetcut", ()), + "phase": ("phase", ()), + "bam2fq": ("bam2fq", ()), + "dict": ("dict", ()), + "addreplacerg": ("addreplacerg", ()), + "pad2unpad": ("pad2unpad", ()), + "depad": ("pad2unpad", ()), + "bedcov": ("bedcov", ()), + "coverage": ("coverage", ()), + "bamshuf": ("bamshuf", ()), + "collate": ("collate", ()), + "stats": ("stats", ()), + "fasta": ("fasta", ()), + "fastq": ("fastq", ()), + "cram_size": ("cram-size", ()), + "quickcheck": ("quickcheck", ()), + "split": ("split", ()), + "flags": ("flags", ()), + "ampliconclip": ("ampliconclip", ()), + "ampliconstats": 
("ampliconstats", ()), + "version": ("version", ()), + "fqimport": ("import", ()), + "import_": ("import", ()), + "samples": ("samples", ()), } -# instantiate samtools commands as python functions -for key, options in SAMTOOLS_DISPATCH.items(): - cmd, parser = options - globals()[key] = PysamDispatcher("samtools", cmd, parser) -__all__ = list(SAMTOOLS_DISPATCH) +def _wrap_command( + dispatch: str, + parsers: Iterable[Tuple[str, Callable[[Union[str, List[str]]], Union[str, List[str]]]]], +) -> PysamDispatcher: + return PysamDispatcher("samtools", dispatch, parsers) + + +if not HAVE_FINAL: + # python 3.7 + for key, options in _SAMTOOLS_DISPATCH.items(): + cmd, parser = options + globals()[key] = PysamDispatcher("samtools", cmd, parser) + + __all__ = list(_SAMTOOLS_DISPATCH) +else: + # python >=3.8 + view: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["view"][0], _SAMTOOLS_DISPATCH["view"][1]) + + head: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["head"][0], _SAMTOOLS_DISPATCH["head"][1]) + + sort: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["sort"][0], _SAMTOOLS_DISPATCH["sort"][1]) + + mpileup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["mpileup"][0], _SAMTOOLS_DISPATCH["mpileup"][1]) + + consensus: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["consensus"][0], + _SAMTOOLS_DISPATCH["consensus"][1], + ) + + depth: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["depth"][0], _SAMTOOLS_DISPATCH["depth"][1]) + + faidx: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["faidx"][0], _SAMTOOLS_DISPATCH["faidx"][1]) + + fqidx: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fqidx"][0], _SAMTOOLS_DISPATCH["fqidx"][1]) + + tview: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["tview"][0], _SAMTOOLS_DISPATCH["tview"][1]) + + index: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["index"][0], _SAMTOOLS_DISPATCH["index"][1]) + + idxstats: Final[PysamDispatcher] 
= _wrap_command(_SAMTOOLS_DISPATCH["idxstats"][0], _SAMTOOLS_DISPATCH["idxstats"][1]) + + fixmate: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fixmate"][0], _SAMTOOLS_DISPATCH["fixmate"][1]) + + flagstat: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["flagstat"][0], _SAMTOOLS_DISPATCH["flagstat"][1]) + + calmd: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["calmd"][0], _SAMTOOLS_DISPATCH["calmd"][1]) + + merge: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["merge"][0], _SAMTOOLS_DISPATCH["merge"][1]) + + markdup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["markdup"][0], _SAMTOOLS_DISPATCH["markdup"][1]) + + rmdup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["rmdup"][0], _SAMTOOLS_DISPATCH["rmdup"][1]) + + reference: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["reference"][0], + _SAMTOOLS_DISPATCH["reference"][1], + ) + + reheader: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["reheader"][0], _SAMTOOLS_DISPATCH["reheader"][1]) + + reset: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["reset"][0], _SAMTOOLS_DISPATCH["reset"][1]) + + cat: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["cat"][0], _SAMTOOLS_DISPATCH["cat"][1]) + + targetcut: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["targetcut"][0], + _SAMTOOLS_DISPATCH["targetcut"][1], + ) + + phase: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["phase"][0], _SAMTOOLS_DISPATCH["phase"][1]) + + bam2fq: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bam2fq"][0], _SAMTOOLS_DISPATCH["bam2fq"][1]) + + dict: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["dict"][0], _SAMTOOLS_DISPATCH["dict"][1]) + + addreplacerg: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["addreplacerg"][0], + _SAMTOOLS_DISPATCH["addreplacerg"][1], + ) + + pad2unpad: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["pad2unpad"][0], + 
_SAMTOOLS_DISPATCH["pad2unpad"][1], + ) + + depad: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["depad"][0], _SAMTOOLS_DISPATCH["depad"][1]) + + bedcov: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bedcov"][0], _SAMTOOLS_DISPATCH["bedcov"][1]) + + coverage: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["coverage"][0], _SAMTOOLS_DISPATCH["coverage"][1]) + + bamshuf: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bamshuf"][0], _SAMTOOLS_DISPATCH["bamshuf"][1]) + + collate: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["collate"][0], _SAMTOOLS_DISPATCH["collate"][1]) + + stats: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["stats"][0], _SAMTOOLS_DISPATCH["stats"][1]) + + fasta: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fasta"][0], _SAMTOOLS_DISPATCH["fasta"][1]) + + fastq: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fastq"][0], _SAMTOOLS_DISPATCH["fastq"][1]) + + cram_size: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["cram_size"][0], _SAMTOOLS_DISPATCH["cram_size"][1]) + + quickcheck: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["quickcheck"][0], + _SAMTOOLS_DISPATCH["quickcheck"][1], + ) + + split: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["split"][0], _SAMTOOLS_DISPATCH["split"][1]) + + flags: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["flags"][0], _SAMTOOLS_DISPATCH["flags"][1]) + + ampliconclip: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["ampliconclip"][0], + _SAMTOOLS_DISPATCH["ampliconclip"][1], + ) + + ampliconstats: Final[PysamDispatcher] = _wrap_command( + _SAMTOOLS_DISPATCH["ampliconstats"][0], + _SAMTOOLS_DISPATCH["ampliconstats"][1], + ) + + version: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["version"][0], _SAMTOOLS_DISPATCH["version"][1]) + + fqimport: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fqimport"][0], 
_SAMTOOLS_DISPATCH["fqimport"][1]) + + import_: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["import_"][0], _SAMTOOLS_DISPATCH["import_"][1]) + + samples: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["samples"][0], _SAMTOOLS_DISPATCH["samples"][1]) diff --git a/pysam/utils.py b/pysam/utils.py index fcc4434..d15f431 100644 --- a/pysam/utils.py +++ b/pysam/utils.py @@ -1,3 +1,11 @@ +from typing import ( + Callable, + List, + Tuple, + Iterable, + Union, +) + from pysam.libcutils import _pysam_dispatch @@ -36,14 +44,20 @@ class PysamDispatcher(object): parsers = None collection = None - def __init__(self, collection, dispatch, parsers): + def __init__( + self, + collection: str, + dispatch: str, + parsers: Iterable[Tuple[str, Callable[[Union[str, List[str]]], Union[str, List[str]]]]], + ): self.collection = collection self.dispatch = dispatch self.parsers = parsers self.stderr = [] - def __call__(self, *args, **kwargs): - '''execute a samtools command. + def __call__(self, *args: str, **kwargs) -> Union[str, List[str]]: + ''' + execute a samtools command. 
Keyword arguments: catch_stdout -- redirect stdout from the samtools command and diff --git a/pysam/version.h b/pysam/version.h index dddd49c..6d353c5 100644 --- a/pysam/version.h +++ b/pysam/version.h @@ -1,5 +1,5 @@ // Version information used while compiling samtools, bcftools, and htslib -#define SAMTOOLS_VERSION "1.16.1 (pysam)" -#define BCFTOOLS_VERSION "1.16 (pysam)" -#define HTS_VERSION_TEXT "1.16 (pysam)" +#define SAMTOOLS_VERSION "1.17 (pysam)" +#define BCFTOOLS_VERSION "1.17 (pysam)" +#define HTS_VERSION_TEXT "1.17 (pysam)" diff --git a/pysam/version.py b/pysam/version.py index b6aede0..78b3ffd 100644 --- a/pysam/version.py +++ b/pysam/version.py @@ -1,6 +1,6 @@ # pysam versioning information -__version__ = "0.20.0" +__version__ = "0.21.0" -__samtools_version__ = "1.16.1" -__bcftools_version__ = "1.16" -__htslib_version__ = "1.16" +__samtools_version__ = "1.17" +__bcftools_version__ = "1.17" +__htslib_version__ = "1.17" diff --git a/samtools/LICENSE b/samtools/LICENSE index a14e403..e72eb63 100644 --- a/samtools/LICENSE +++ b/samtools/LICENSE @@ -1,6 +1,6 @@ The MIT/Expat License -Copyright (C) 2008-2022 Genome Research Ltd. +Copyright (C) 2008-2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/samtools/README b/samtools/README index 7be5383..60b37ac 100644 --- a/samtools/README +++ b/samtools/README @@ -9,7 +9,7 @@ Building samtools The typical simple case of building Samtools using the HTSlib bundled within this Samtools release tarball is done as follows: - cd .../samtools-1.16.1 # Within the unpacked release directory + cd .../samtools-1.17 # Within the unpacked release directory ./configure make @@ -21,7 +21,7 @@ install samtools etc properly into a directory of your choosing. 
Building for installation using the HTSlib bundled within this Samtools release tarball, and building the various HTSlib utilities such as bgzip is done as follows: - cd .../samtools-1.16.1 # Within the unpacked release directory + cd .../samtools-1.17 # Within the unpacked release directory ./configure --prefix=/path/to/location make all all-htslib make install install-htslib @@ -48,7 +48,7 @@ There are two advantages to this: To build with plug-ins, you need to use the --enable-plugins configure option as follows: - cd .../samtools-1.16.1 # Within the unpacked release directory + cd .../samtools-1.17 # Within the unpacked release directory ./configure --enable-plugins --prefix=/path/to/location make all all-htslib make install install-htslib @@ -66,8 +66,8 @@ Setting --with-plugin-path is useful if you want to run directly from the source distribution instead of installing the package. In that case you can use: - cd .../samtools-1.16.1 # Within the unpacked release directory - ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.16 + cd .../samtools-1.17 # Within the unpacked release directory + ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.17 make all all-htslib It is possible to override the built-in search path using the HTS_PATH diff --git a/samtools/amplicon_stats.c b/samtools/amplicon_stats.c index 62bb15c..3842fb3 100644 --- a/samtools/amplicon_stats.c +++ b/samtools/amplicon_stats.c @@ -1475,6 +1475,8 @@ static int amplicon_stats(astats_args_t *args, } int r; for (r = 0; r < nref; r++) { + if (!amps[r].sites) + continue; if (!amps[r].ref || strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 || amps[r].len != sam_hdr_tid2len(header, r)) { diff --git a/samtools/amplicon_stats.c.pysam.c b/samtools/amplicon_stats.c.pysam.c index aa09459..b71ac4a 100644 --- a/samtools/amplicon_stats.c.pysam.c +++ b/samtools/amplicon_stats.c.pysam.c @@ -1477,6 +1477,8 @@ static int amplicon_stats(astats_args_t *args, } int r; for (r = 0; r < nref; 
r++) { + if (!amps[r].sites) + continue; if (!amps[r].ref || strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 || amps[r].len != sam_hdr_tid2len(header, r)) { diff --git a/samtools/bam2depth.c b/samtools/bam2depth.c index 098d3ae..9266b61 100644 --- a/samtools/bam2depth.c +++ b/samtools/bam2depth.c @@ -72,6 +72,8 @@ typedef struct { typedef struct { int header; int flag; + int incl_flag; + int require_flag; int min_qual; int min_mqual; int min_len; @@ -564,7 +566,11 @@ static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn, if (b[i]->core.tid < 0) continue; if (b[i]->core.flag & opt->flag) - continue; + continue; // must have none of the flags set + if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0) + continue; // must have at least one flag set + if ((b[i]->core.flag & opt->require_flag) != opt->require_flag) + continue; // must have all flags set if (b[i]->core.qual < opt->min_mqual) continue; @@ -654,7 +660,11 @@ static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn, if (b[i]->core.tid < 0) continue; if (b[i]->core.flag & opt->flag) - continue; + continue; // must have none of the flags set + if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0) + continue; // must have at least one flag set + if ((b[i]->core.flag & opt->require_flag) != opt->require_flag) + continue; // must have all flags set if (b[i]->core.qual < opt->min_mqual) continue; @@ -712,8 +722,14 @@ static void usage_exit(FILE *fp, int exit_status) fprintf(fp, " -b FILE Use bed FILE for list of regions\n"); fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n"); fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n"); - fprintf(fp, " -g INT Remove specified flags from default flag filter\n"); - fprintf(fp, " -G INT Add specified flags to the default flag filter\n"); + fprintf(fp, " -g INT Remove specified flags from default filter-out flag list\n"); + fprintf(fp, " -G, --excl-flags FLAGS\n"); + fprintf(fp, " Add
specified flags to the default filter-out flag list\n"); + fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); + fprintf(fp, " --incl-flags FLAGS\n"); + fprintf(fp, " Only include records with at least one the FLAGs present [0]\n"); + fprintf(fp, " --require-flags FLAGS\n"); + fprintf(fp, " Only include records with all of the FLAGs present [0]\n"); fprintf(fp, " -H Print a file header line\n"); fprintf(fp, " -l INT Minimum read length [0]\n"); fprintf(fp, " -o FILE Write output to FILE [stdout]\n"); @@ -721,7 +737,6 @@ static void usage_exit(FILE *fp, int exit_status) " Filter bases with base quality smaller than INT [0]\n"); fprintf(fp, " -Q, --min-MQ INT\n" " Filter alignments with mapping quality smaller than INT [0]\n"); - fprintf(fp, " -H Print a file header\n"); fprintf(fp, " -J Include reads with deletions in depth computation\n"); fprintf(fp, " -s Do not count overlapping reads within a template\n"); sam_global_opt_help(fp, "-.--.@-."); @@ -738,6 +753,8 @@ int main_depth(int argc, char *argv[]) char *out_file = NULL; depth_opt opt = { .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL, + .incl_flag = 0, + .require_flag = 0, .min_qual = 0, .min_mqual = 0, .skip_del = 1, @@ -752,10 +769,13 @@ int main_depth(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - {"min-MQ", required_argument, NULL, 'Q'}, - {"min-mq", required_argument, NULL, 'Q'}, - {"min-BQ", required_argument, NULL, 'q'}, - {"min-bq", required_argument, NULL, 'q'}, + {"min-MQ", required_argument, NULL, 'Q'}, + {"min-mq", required_argument, NULL, 'Q'}, + {"min-BQ", required_argument, NULL, 'q'}, + {"min-bq", required_argument, NULL, 'q'}, + {"excl-flags", required_argument, NULL, 'G'}, + {"incl-flags", required_argument, NULL, 1}, + {"require-flags", required_argument, NULL, 2}, SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), {NULL, 0, NULL, 0} }; @@ -788,9 +808,15 @@ int main_depth(int argc, char *argv[]) case 'g': opt.flag &= 
~bam_str2flag(optarg); break; - case 'G': + case 'G': // reject if any set opt.flag |= bam_str2flag(optarg); break; + case 1: // reject unless at least one set (0 means ignore option) + opt.incl_flag |= bam_str2flag(optarg); + break; + case 2: // reject unless all set + opt.require_flag |= bam_str2flag(optarg); + break; case 'l': opt.min_len = atoi(optarg); diff --git a/samtools/bam2depth.c.pysam.c b/samtools/bam2depth.c.pysam.c index e713822..abe6141 100644 --- a/samtools/bam2depth.c.pysam.c +++ b/samtools/bam2depth.c.pysam.c @@ -74,6 +74,8 @@ typedef struct { typedef struct { int header; int flag; + int incl_flag; + int require_flag; int min_qual; int min_mqual; int min_len; @@ -566,7 +568,11 @@ static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn, if (b[i]->core.tid < 0) continue; if (b[i]->core.flag & opt->flag) - continue; + continue; // must have none of the flags set + if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0) + continue; // must have at least one flag set + if ((b[i]->core.flag & opt->require_flag) != opt->require_flag) + continue; // must have all lags set if (b[i]->core.qual < opt->min_mqual) continue; @@ -656,7 +662,11 @@ static int fastdepth_core(depth_opt *opt, uint32_t nfiles, char **fn, if (b[i]->core.tid < 0) continue; if (b[i]->core.flag & opt->flag) - continue; + continue; // must have none of the flags set + if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0) + continue; // must have at least one flag set + if ((b[i]->core.flag & opt->require_flag) != opt->require_flag) + continue; // must have all lags set if (b[i]->core.qual < opt->min_mqual) continue; @@ -714,8 +724,14 @@ static void usage_exit(FILE *fp, int exit_status) fprintf(fp, " -b FILE Use bed FILE for list of regions\n"); fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n"); fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n"); - fprintf(fp, " -g INT Remove specified flags from default flag 
filter\n"); - fprintf(fp, " -G INT Add specified flags to the default flag filter\n"); + fprintf(fp, " -g INT Remove specified flags from default filter-out flag list\n"); + fprintf(fp, " -G, --excl-flags FLAGS\n"); + fprintf(fp, " Add specified flags to the default filter-out flag list\n"); + fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); + fprintf(fp, " --incl-flags FLAGS\n"); + fprintf(fp, " Only include records with at least one the FLAGs present [0]\n"); + fprintf(fp, " --require-flags FLAGS\n"); + fprintf(fp, " Only include records with all of the FLAGs present [0]\n"); fprintf(fp, " -H Print a file header line\n"); fprintf(fp, " -l INT Minimum read length [0]\n"); fprintf(fp, " -o FILE Write output to FILE [samtools_stdout]\n"); @@ -723,7 +739,6 @@ static void usage_exit(FILE *fp, int exit_status) " Filter bases with base quality smaller than INT [0]\n"); fprintf(fp, " -Q, --min-MQ INT\n" " Filter alignments with mapping quality smaller than INT [0]\n"); - fprintf(fp, " -H Print a file header\n"); fprintf(fp, " -J Include reads with deletions in depth computation\n"); fprintf(fp, " -s Do not count overlapping reads within a template\n"); sam_global_opt_help(fp, "-.--.@-."); @@ -740,6 +755,8 @@ int main_depth(int argc, char *argv[]) char *out_file = NULL; depth_opt opt = { .flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL, + .incl_flag = 0, + .require_flag = 0, .min_qual = 0, .min_mqual = 0, .skip_del = 1, @@ -754,10 +771,13 @@ int main_depth(int argc, char *argv[]) sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { - {"min-MQ", required_argument, NULL, 'Q'}, - {"min-mq", required_argument, NULL, 'Q'}, - {"min-BQ", required_argument, NULL, 'q'}, - {"min-bq", required_argument, NULL, 'q'}, + {"min-MQ", required_argument, NULL, 'Q'}, + {"min-mq", required_argument, NULL, 'Q'}, + {"min-BQ", required_argument, NULL, 'q'}, + {"min-bq", required_argument, NULL, 'q'}, + {"excl-flags", required_argument, NULL, 'G'}, + 
{"incl-flags", required_argument, NULL, 1}, + {"require-flags", required_argument, NULL, 2}, SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'), {NULL, 0, NULL, 0} }; @@ -790,9 +810,15 @@ int main_depth(int argc, char *argv[]) case 'g': opt.flag &= ~bam_str2flag(optarg); break; - case 'G': + case 'G': // reject if any set opt.flag |= bam_str2flag(optarg); break; + case 1: // reject unless at least one set (0 means ignore option) + opt.incl_flag |= bam_str2flag(optarg); + break; + case 2: // reject unless all set + opt.require_flag |= bam_str2flag(optarg); + break; case 'l': opt.min_len = atoi(optarg); diff --git a/samtools/bam_ampliconclip.c b/samtools/bam_ampliconclip.c index 2cf1ac1..91fc858 100644 --- a/samtools/bam_ampliconclip.c +++ b/samtools/bam_ampliconclip.c @@ -1,7 +1,7 @@ /* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads from the 5' end. - Copyright (C) 2020-2021 Genome Research Ltd. + Copyright (C) 2020-2022 Genome Research Ltd. Authors: Andrew Whitwham Rob Davies @@ -308,18 +308,6 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); memcpy(rec_out->data, rec->data, rec->core.l_qname); - if (clipping == hard_clip && bases >= rec->core.l_qseq) { - rec_out->core.l_qseq = 0; - rec_out->core.n_cigar = 0; - - if (orig_l_aux) - memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); - - rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; - - return 0; - } - // Modify CIGAR new_cigar = bam_get_cigar(rec_out); @@ -355,6 +343,19 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, qry_removed += ref_remove; } } else { + if (clipping == hard_clip) { + + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; + } + qry_removed = rec->core.l_qseq; } @@ -457,17 +458,6 @@ 
static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases, memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); memcpy(rec_out->data, rec->data, rec->core.l_qname); - if (clipping == hard_clip && bases >= rec->core.l_qseq) { - rec_out->core.l_qseq = 0; - rec_out->core.n_cigar = 0; - - if (orig_l_aux) - memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); - - rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; - return 0; - } - // Modify CIGAR here new_cigar = bam_get_cigar(rec_out); @@ -500,6 +490,19 @@ static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases, if (qry_removed > 0) j++; if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++; } else { + if (clipping == hard_clip) { + + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; + } + qry_removed = rec->core.l_qseq; j = 0; if (hardclip > 0 && clipping == soft_clip) j++; diff --git a/samtools/bam_ampliconclip.c.pysam.c b/samtools/bam_ampliconclip.c.pysam.c index 1feda1d..4eb9c5a 100644 --- a/samtools/bam_ampliconclip.c.pysam.c +++ b/samtools/bam_ampliconclip.c.pysam.c @@ -3,7 +3,7 @@ /* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads from the 5' end. - Copyright (C) 2020-2021 Genome Research Ltd. + Copyright (C) 2020-2022 Genome Research Ltd. 
Authors: Andrew Whitwham Rob Davies @@ -310,18 +310,6 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); memcpy(rec_out->data, rec->data, rec->core.l_qname); - if (clipping == hard_clip && bases >= rec->core.l_qseq) { - rec_out->core.l_qseq = 0; - rec_out->core.n_cigar = 0; - - if (orig_l_aux) - memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); - - rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; - - return 0; - } - // Modify CIGAR new_cigar = bam_get_cigar(rec_out); @@ -357,6 +345,19 @@ static int bam_trim_left(bam1_t *rec, bam1_t *rec_out, uint32_t bases, qry_removed += ref_remove; } } else { + if (clipping == hard_clip) { + + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; + } + qry_removed = rec->core.l_qseq; } @@ -459,17 +460,6 @@ static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases, memcpy(&rec_out->core, &rec->core, sizeof(rec->core)); memcpy(rec_out->data, rec->data, rec->core.l_qname); - if (clipping == hard_clip && bases >= rec->core.l_qseq) { - rec_out->core.l_qseq = 0; - rec_out->core.n_cigar = 0; - - if (orig_l_aux) - memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); - - rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; - return 0; - } - // Modify CIGAR here new_cigar = bam_get_cigar(rec_out); @@ -502,6 +492,19 @@ static int bam_trim_right(bam1_t *rec, bam1_t *rec_out, uint32_t bases, if (qry_removed > 0) j++; if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++; } else { + if (clipping == hard_clip) { + + rec_out->core.l_qseq = 0; + rec_out->core.n_cigar = 0; + + if (orig_l_aux) + memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux); + + rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux; + + return 0; + } + qry_removed 
= rec->core.l_qseq; j = 0; if (hardclip > 0 && clipping == soft_clip) j++; diff --git a/samtools/bam_consensus.c b/samtools/bam_consensus.c index 072dcd3..4cdaf3f 100644 --- a/samtools/bam_consensus.c +++ b/samtools/bam_consensus.c @@ -1,7 +1,7 @@ /* bam_consensus.c -- consensus subcommand. Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source) - Copyright (C) 2003-2005,2007-2022 Genome Research Ltd. + Copyright (C) 2003-2005,2007-2023 Genome Research Ltd. Author: James Bonfield @@ -99,6 +99,30 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // but 30T+ 20T- 18A+ 2A- seems like a consistent A miscall on one strand // only, while T is spread evenly across both strands. +// TODO: Phasing of long reads. +// Long reads offer very strong phasing opportunities for SNPs. +// From these, we get strong evidence for accuracy of indels. +// Specifically whether the distribution of poly-len within a phases +// is significantly different to the distribution of poly len between +// phases. + +// TODO end STR trimming. Eg: +// REF AAGCTGAAAAGTTAATGTCTTATTTTTTTTTTTTTTTTGAGATGGAGTC +// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc +// aagctgaaaagttaatgtcttattttttttt +// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc +// Middle seq doesn't validate those initial T alignments. +// Qual_train solves this by use of the STR trimmer. + +// TODO add a weight for proximity to homopolymer. +// Maybe length/distance? So 3 away from a 12-mer is similar to 1 away +// from a 4-mer? + +// TODO: Count number of base types between this point and the nearest +// indel or end of read. Eg GATCGAGAGAG*TAGC => 2 (A and G). +// adj is nbase/4 * score, or (nbase+1)/5? +// Perhaps multiplied by length too, to get local complexity score? + #include #include @@ -110,6 +134,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include "samtools.h" #include "sam_opts.h" @@ -129,6 +154,21 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # define MAX(a,b) ((a)>(b)?(a):(b)) #endif +// Defines for experiment code which is currently disabled + +// Hardy-Weinberg statistics to check heterozygous sites match allelic +// frequencies. +//#define DO_HDW + +// Filter bayesian calls by min-depth and min-fract parameters +//#define DO_FRACT + +// Checks uniqueness of surrounding bases to adjust scores +//#define K2 2 + +// Look for strand bias in distribution of homopolymer lengths +//#define DO_POLY_DIST + // Minimum cutoff for storing mod data; => at least 10% chance #define MOD_CUTOFF 0.46 @@ -140,6 +180,14 @@ enum format { typedef unsigned char uc; +// Simple recalibration table for substitutions, undercalls and overcalls. +// In future, we'll update this to be kmer based too. +typedef struct { + int smap[101]; // substituion or SNP + int umap[101]; // undercall or DEL + int omap[101]; // overcall or INS +} qcal_t; + typedef struct { // User options char *reg; @@ -156,7 +204,7 @@ typedef struct { int min_depth; double call_fract; double het_fract; - int gap5; + int mode; // One of MODE_* macros below enum format fmt; int cons_cutoff; int ambig; @@ -166,10 +214,16 @@ typedef struct { int all_bases; int show_del; int show_ins; + int mark_ins; int excl_flags; int incl_flags; int min_mqual; double P_het; + double P_indel; + double het_scale; + double homopoly_fix; + double homopoly_redux; + qcal_t qcal; // Internal state samFile *fp; @@ -221,7 +275,10 @@ typedef struct { float discrep; } consensus_t; -#define P_HET 1e-4 +#define P_HET 1e-3 +#define P_INDEL 2e-4 +#define P_HOMOPOLY 0.5 +#define P_HET_SCALE 1.0 #define LOG10 2.30258509299404568401 #define TENOVERLOG10 4.34294481903251827652 @@ -233,20 +290,38 @@ typedef struct { #define ALIGNED(x) #endif -static double prior[25] ALIGNED(16); /* Sum to 1.0 */ -static double lprior15[15] ALIGNED(16); /* 15 
combinations of {ACGT*} */ - -/* Precomputed matrices for the consensus algorithm */ -static double pMM[101] ALIGNED(16); -static double p__[101] ALIGNED(16); -static double p_M[101] ALIGNED(16); - +// Initialised once as a global array. This won't work if threaded, +// but we'll rewrite if and when that gets added later. static double e_tab_a[1002] ALIGNED(16); static double *e_tab = &e_tab_a[500]; static double e_tab2_a[1002] ALIGNED(16); static double *e_tab2 = &e_tab2_a[500]; static double e_log[501] ALIGNED(16); +/* Precomputed matrices for the consensus algorithm */ +typedef struct { + double prior[25] ALIGNED(16); /* Sum to 1.0 */ + double lprior15[15] ALIGNED(16); /* 15 combinations of {ACGT*} */ + + double pMM[101] ALIGNED(16); + double p__[101] ALIGNED(16); + double p_M[101] ALIGNED(16); + double po_[101] ALIGNED(16); + double poM[101] ALIGNED(16); + double poo[101] ALIGNED(16); + double puu[101] ALIGNED(16); + double pum[101] ALIGNED(16); + double pmm[101] ALIGNED(16); + + // Multiplier on homopolymer length before reducing phred qual + double poly_mul; +} cons_probs; + +// Two sets of params; recall oriented (gap5) and precision (stf). +// We use the former unless MODE_MIXED is set (which is the default +// for bayesian consensus mode if P_indel is significant). +static cons_probs cons_prob_recall, cons_prob_precise; + /* * Lots of confusing matrix terms here, so some definitions will help. * @@ -284,11 +359,327 @@ static double e_log[501] ALIGNED(16); * The heterozygosity weight though is a per column calculation as we're * trying to model whether the column is pure or mixed. Hence this is done * once via a prior and has no affect on the individual matrix cells. + * + * We have a generic indel probability, but it's a catch all for overcall, + * undercall, alignment artifacts, homopolymer issues, etc. So we can set + * it considerably higher and just let the QUAL skew do the filtering for + * us, albeit no longer well calibrated. 
*/ -static void consensus_init(double p_het) { +// NB: Should _M be MM? +// Ie sample really is A/C het, and we observe C. That should be a match, +// not half a match. + +#define MODE_SIMPLE 0 // freq counting + +#define MODE_BAYES_116 1 // Samtools 1.16 (no indel param) +#define MODE_RECALL 2 // so called as it's the params from Gap5 +#define MODE_PRECISE 3 // a more precise set; +FN, --FP +#define MODE_MIXED 4 // Combination of GAP5/BAYES + +#define QCAL_FLAT 0 +#define QCAL_HIFI 1 +#define QCAL_HISEQ 2 +#define QCAL_ONT_R10_4_SUP 3 +#define QCAL_ONT_R10_4_DUP 4 +#define QCAL_ULTIMA 5 + +// Calibration tables here don't necessarily reflect the true accuracy. +// They have been manually tuned to work in conjunction with other command +// line parameters used in the machine profiles. For example reducing one +// qual here and increasing sensitivity elsewhere via another parameter. +static qcal_t static_qcal[6] = { + { // FLAT + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 
58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99} + }, + + { // HiFi + {10, 11, 11, 12, 13, 14, 15, 16, 18, 19, + 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, + 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 38, 39, 39, 40, 40, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + }, + { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, + { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + } + }, + + { // HiSeq + { 2, 2, 2, 3, 3, 4, 5, 5, 6, 7, + 8, 9, 10, 11, 11, 12, 13, 14, 15, 16, + 17, 17, 18, 19, 20, 21, 22, 22, 23, 24, + 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, + 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, + 42, 43, 44, 45, 45, 46, 47, 48, 49, 50, + 51, 51, 52, 53, 54, 55, 56, 56, 57, 58, + 59, 60, 61, 62, 62, 63, 64, 65, 66, 67, + 68, 68, 69, 70, 71, 72, 73, 73, 74, 75, + 76, 77, 78, 79, 79, 80, 81, 82, 83, 84, + }, + { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, + 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, + 25, 26, 27, 28, 29, 31, 
32, 33, 34, 35, + 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, + 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, + 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, + 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, + 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, + 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, + 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, + }, + { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, + 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, + 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, + 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, + 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, + 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, + 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, + 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, + 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, + 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, + } + }, + { // ONT R10.4 super + { 0, 2, 2, 2, 3, 4, 4, 5, 6, 7, + 7, 8, 9, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, + 30, 31, 33, 34, 36, 37, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 41, 40, 40, + 41, 41, 40, 40, 40, 40, 41, 40, 40, 40, + 40, 41, 41, 40, 40, 41, 40, 40, 39, 41, + 40, 41, 40, 40, 41, 41, 41, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + }, + { 0, 2, 2, 2, 3, 4, 5, 6, 7, 8, + 8, 9, 9, 10, 10, 10, 11, 12, 12, 13, + 13, 13, 14, 14, 15, 16, 16, 17, 18, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + }, + { 0, 4, 6, 6, 6, 7, 7, 8, 9, 9, + 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 15, 15, 15, 16, 16, 17, 17, 18, 18, 19, + 19, 20, 20, 21, 22, 22, 23, 23, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 
24, 24, 24, 24, 24, 24, 24, 24, 24, + } + }, + { // ONT R10.4 duplex; just a copy of hifi for now + {10, 11, 11, 12, 13, 14, 15, 16, 18, 19, + 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, + 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 38, 39, 39, 40, 40, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + }, + { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, + { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + } + }, + { // Ultima Genomics + { 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, + 10, 10, 11, 12, 13, 14, 14, 15, 16, 17, + 18, 18, 19, 21, 22, 23, 23, 24, 25, 26, + 27, 27, 28, 29, 30, 31, 31, 32, 33, 34, + 35, 35, 36, 37, 38, 39, 39, 40, 42, 43, + 44, 44, 45, 46, 47, 48, 48, 49, 50, 51, + 52, 52, 53, 54, 55, 56, 56, 57, 58, 59, + 60, 60, 61, 63, 64, 65, 65, 66, 67, 68, + 69, 69, 70, 71, 72, 73, 73, 74, 75, 76, + 77, 77, 78, 79, 80, 81, 81, 82, 84, 85, + }, + { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, + 5, 5, 6, 6, 7, 7, 8, 8, 9, 10, + 10, 10, 11, 12, 13, 13, 13, 14, 15, 16, + 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 22, 22, 23, 23, 24, 24, 25, 
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + }, + { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, + 5, 5, 6, 6, 7, 7, 8, 8, 9, 10, + 10, 10, 11, 12, 13, 13, 13, 14, 15, 16, + 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 22, 22, 23, 23, 24, 24, 25, + 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + } + } +}; + +int set_qcal(qcal_t *q, int id) { + if (id < 0 || id >= sizeof(static_qcal)/sizeof(*static_qcal)) + return -1; + + memcpy(q, &static_qcal[id], sizeof(*q)); + return 0; +} + +int load_qcal(qcal_t *q, const char *fn) { int i; + if (strcmp(fn, ":hifi") == 0) + return set_qcal(q, QCAL_HIFI); + if (strcmp(fn, ":hiseq") == 0) + return set_qcal(q, QCAL_HISEQ); + if (strcmp(fn, ":r10.4_sup") == 0) + return set_qcal(q, QCAL_ONT_R10_4_SUP); + if (strcmp(fn, ":r10.4_dup") == 0) + return set_qcal(q, QCAL_ONT_R10_4_DUP); + if (strcmp(fn, ":ultima") == 0) + return set_qcal(q, QCAL_ULTIMA); + + // default + for (i = 0; i < 101; i++) + q->smap[i] = q->umap[i] = q->omap[i] = i; + + if (strcmp(fn, ":flat") == 0) + return 0; + + hFILE *fp = hopen(fn, "r"); + if (!fp) + return -1; + + kstring_t line = KS_INITIALIZE; + int max = 0; + int last_qual = 0; + while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) { + int v, s, u, o; + if (*line.s == '#') + continue; + if (sscanf(line.s, "QUAL %d %d %d %d", &v, &s, &u, &o) != 4) + goto err; + while (v > last_qual) { + q->smap[last_qual+1] = q->smap[last_qual]; + q->umap[last_qual+1] = q->umap[last_qual]; + q->omap[last_qual+1] = q->omap[last_qual]; + last_qual++; + } + if (v >= 0 && v < 100) { + q->smap[v] = s; + q->umap[v] = u; + q->omap[v] = o; + } + if (v < max) { + fprintf(stderr, "Qual calibration file is 
not in ascending order\n"); + return hclose(fp) ? -2 : -1; + } + max = v; + } + + for (i = max+1; i < 101; i++) { + q->smap[i] = q->smap[max]; + q->umap[i] = q->umap[max]; + q->omap[i] = q->omap[max]; + } + + ks_free(&line); + return hclose(fp) < 0 ? -2 : 0; + + err: + ks_free(&line); + return hclose(fp) < 0 ? -2 : -1; +} + +static void consensus_init(double p_het, double p_indel, double het_scale, + double poly_mul, + qcal_t *qcal, int mode, cons_probs *cp) { + int i; + + // NB: only need to initialise once, but we do here for now for (i = -500; i <= 500; i++) e_tab[i] = exp(i); for (i = -500; i <= 500; i++) @@ -296,43 +687,136 @@ static void consensus_init(double p_het) { for (i = 0; i <= 500; i++) e_log[i] = log(i); - // Heterozygous locations + // EXPERIMENTAL + cp->poly_mul = poly_mul; + + // The priors make very little difference, unless shallow data. + // ACGT* by ACGT* + // So AA=0, CC=6, GG=12, TT=18, **=24 for (i = 0; i < 25; i++) - prior[i] = p_het / 20; - prior[0] = prior[6] = prior[12] = prior[18] = prior[24] = (1-p_het)/5; - - lprior15[0] = log(prior[0]); - lprior15[1] = log(prior[1]*2); - lprior15[2] = log(prior[2]*2); - lprior15[3] = log(prior[3]*2); - lprior15[4] = log(prior[4]*2); - lprior15[5] = log(prior[6]); - lprior15[6] = log(prior[7]*2); - lprior15[7] = log(prior[8]*2); - lprior15[8] = log(prior[9]*2); - lprior15[9] = log(prior[12]); - lprior15[10] = log(prior[13]*2); - lprior15[11] = log(prior[14]*2); - lprior15[12] = log(prior[18]); - lprior15[13] = log(prior[19]*2); - lprior15[14] = log(prior[24]); - - - // Rewrite as new form + cp->prior[i] = p_het / 6; // AC AG AT CG CT GT + + // Flat assumption that it is what we observe, and measure everything else + // as relative to this. 
+ cp->prior[0]=cp->prior[6]=cp->prior[12]=cp->prior[18]=cp->prior[24] = 1; + + // heterozygous deletion + for (i = 4; i < 24; i+=5) + cp->prior[i] = p_indel / 6; // /6 to be scaled vs p_het equivalently + + // heterozygous insertion + for (i = 20; i < 24; i++) + cp->prior[i] = p_indel / 6; + + cp->lprior15[0] = log(cp->prior[0]); + cp->lprior15[1] = log(cp->prior[1]); + cp->lprior15[2] = log(cp->prior[2]); + cp->lprior15[3] = log(cp->prior[3]); + cp->lprior15[4] = log(cp->prior[4]); + cp->lprior15[5] = log(cp->prior[6]); + cp->lprior15[6] = log(cp->prior[7]); + cp->lprior15[7] = log(cp->prior[8]); + cp->lprior15[8] = log(cp->prior[9]); + cp->lprior15[9] = log(cp->prior[12]); + cp->lprior15[10] = log(cp->prior[13]); + cp->lprior15[11] = log(cp->prior[14]); + cp->lprior15[12] = log(cp->prior[18]); + cp->lprior15[13] = log(cp->prior[19]); + cp->lprior15[14] = log(cp->prior[24]); + for (i = 1; i < 101; i++) { - double prob = 1 - pow(10, -i / 10.0); - - // May want to multiply all these by 5 so pMM[i] becomes close - // to -0 for most data. This makes the sums increment very slowly, - // keeping bit precision in the accumulator. - pMM[i] = log(prob/5); - p__[i] = log((1-prob)/20); - p_M[i] = log((exp(pMM[i]) + exp(p__[i]))/2); + double prob = 1 - pow(10, -qcal->smap[i] / 10.0); + + // Or is it that prob is 1-p(subst)-p(overcall)? + cp->pMM[i] = log(prob); + + //cp->p__[i] = log(1-prob); // Big help to PB-CCS SNPs; unless fudged + cp->p__[i] = log((1-prob)/3); // correct? poor on PB-CCS w/o fudge + + // Mixed alleles; just average two likelihoods + cp->p_M[i] = log((exp(cp->pMM[i]) + exp(cp->p__[i]))/2); + + // What does this really mean? Can we simulate this by priors? + // It reduces the likelihood of calling het sites, which is + // maybe compensation for alignment artifacts? I'm unsure, + // but it works (to differing degrees) on both PacBio HiFi and + // Illumina HiSeq. 
It (obviously) loses true hets, but + // potentially this can be compensated for by tweaking P-het + // (which is entirely in the priors). + // + // Low het_scale reduces false positives by making hets less + // likely to be called. In high depth data we normally have + // enough evidence to call correctly even with low het_scale, + // so it's a good +FN vs --FP tradeoff. However on low depth + // data, het_scale can filter out too many true variants. + // + // TODO: So consider adjusting at the end maybe? + // Also consider never changing calls, but changing their + // confidence, so the data is what produces the call with the + // parameters skewing the quality score distribution. + cp->p_M[i] += log(het_scale); + + if (mode == MODE_BAYES_116) { + // Compatibility with samtools 1.16 + + // This had no differention for indel vs substitution error rates, + // so o(vercall) and u(undercall) are subst(_). + cp->pmm[i] = cp->pMM[i]; + cp->poM[i] = cp->p_M[i]; + cp->pum[i] = cp->p_M[i]; + cp->po_[i] = cp->p__[i]; + cp->poo[i] = cp->p__[i]; + cp->puu[i] = cp->p__[i]; + + } else { + // When observing A C G T; leads to insertion calls + prob = 1 - pow(10, -qcal->omap[i] / 10.0); + // /3 for consistency with ACGT rem as relative likelihoods. + // Otherwise with flat priors we end up calling all shallow data + // as "*", which is illogical. + cp->poo[i] = log((1-prob)/3); + + // Ensure pMM is always more likely. (NB: This shouldn't happen + // now with the addition of the /3 step above.) + if (cp->poo[i] > cp->pMM[i]-.5) + cp->poo[i] = cp->pMM[i]-.5; + + cp->po_[i] = log((exp(cp->poo[i]) + exp(cp->p__[i]))/2); + cp->poM[i] = log((exp(cp->poo[i]) + exp(cp->pMM[i]))/2); + + // Overcalls should never be twice as likely than mismatches. + // Het bases are mix of _M (other) and MM ops (this). + // It's fine for _M to be less likely than oM (more likely + // to be overcalled than miscalled), but it should never + // be stronger when combined with other mixed data. 
+ if (cp->poM[i] > cp->p_M[i]+.5) + cp->poM[i] = cp->p_M[i]+.5; + + // Note --low-MQ and --scale-MQ have a big impact on + // undercall errs. May need to separate these options per + // type, but how? + // Multiple-calls, as with mixed mode? This feels like a cheat + + prob = 1 - pow(10, -qcal->umap[i] / 10.0); + cp->pmm[i] = log(prob); + cp->puu[i] = log((1-prob)/3); + if (cp->puu[i] > cp->pMM[i]-.5) // MM is -ve + cp->puu[i] = cp->pMM[i]-.5; + + cp->pum[i] = log((exp(cp->puu[i]) + exp(cp->pmm[i]))/2); + } } - pMM[0] = pMM[1]; - p__[0] = p__[1]; - p_M[0] = p_M[1]; + cp->pMM[0] = cp->pMM[1]; + cp->p__[0] = cp->p__[1]; + cp->p_M[0] = cp->p_M[1]; + + cp->pmm[0] = cp->pmm[1]; + cp->poo[0] = cp->poo[1]; + cp->po_[0] = cp->po_[1]; + cp->poM[0] = cp->poM[1]; + cp->puu[0] = cp->puu[1]; + cp->pum[0] = cp->pum[1]; } static inline double fast_exp(double y) { @@ -380,6 +864,51 @@ int nins(const bam1_t *b){ return indel; } +/* + * Some machines, including 454 and PacBio, store the quality values in + * homopolymers with the first or last base always being the low quality + * state. This can cause problems when reverse-complementing and aligning, + * especially when we left-justify indels. + * + * Other platforms take the approach of having the middle bases high and + * the low confidence spread evenly to both start and end. This means + * reverse-complementing doesn't introduce any strand bias. + * + * We redistribute qualities within homopolymers in this style to fix + * naive consensus or variant calling algorithms. 
+ */ +void homopoly_qual_fix(bam1_t *b) { + static double ph2err[256] = {0}; + int i; + if (!ph2err[0]) { + for (i = 0; i < 256; i++) + ph2err[i] = pow(10, i/-10.0); + } + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + for (i = 0; i < b->core.l_qseq; i++) { + int s = i; // start of homopoly + int base = bam_seqi(seq, i); + while (i+1 < b->core.l_qseq && bam_seqi(seq, i+1) == base) + i++; + // s..i inclusive is now homopolymer + + if (s == i) + continue; + + // Simplest: reverse if end_qual < start_qual + // Next: average outer-most two, then next two, etc + // Best: fully redistribute so start/end lower qual than centre + + // Middle route of averaging outer pairs is sufficient? + int j, k; + for (j = s, k = i; j < k; j++,k--) { + double e = ph2err[qual[j]] + ph2err[qual[k]]; + qual[j] = qual[k] = -fast_log2(e/2)*3.0104+.49; + } + } +} + // Return the local NM figure within halo (+/- HALO) of pos. // This local NM is used as a way to modify MAPQ to get a localised MAPQ // score via an adhoc fashion. @@ -389,11 +918,22 @@ double nm_local(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { return 0; pos -= b->core.pos; if (pos < 0) - return nm[0]; + return nm[0] & ((1<<24)-1); if (pos >= b->core.l_qseq) - return nm[b->core.l_qseq-1]; + return nm[b->core.l_qseq-1] & ((1<<24)-1); + + return (nm[pos] & ((1<<24)-1)) / 10.0; +} - return nm[pos] / 10.0; +int poly_len(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { + int *nm = (int *)p->cd; + if (!nm) + return 0; + pos -= b->core.pos; + if (pos >= 0 && pos < b->core.l_qseq) + return nm[pos] >> 24; + else + return 0; } /* @@ -413,68 +953,91 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { const bam1_t *b = &p->b; int qlen = b->core.l_qseq, i; + if (qlen <= 0) + return 0; int *local_nm = calloc(qlen, sizeof(*local_nm)); if (!local_nm) return -1; p->cd = local_nm; + double poly_adj = opts->homopoly_fix ? 
opts->homopoly_fix : 1; + if (opts->adj_qual) { -#if 0 - // Tweak by localised quality. - // Quality is reduced by a significant portion of the minimum quality - // in neighbouring bases, on the pretext that if the region is bad, then - // this base is bad even if it claims otherwise. + // Set local_nm based on a function of current qual and the local + // minimum qual within the surrounding window. + // + // Basically if we're in a region of low confidence then we downgrade + // higher qual outliers as they may not be as trustworthy as they + // claim. This may be because the qualities have been assigned to + // the wrong or arbitrary base (very common in homopolymers), or the + // surrounding quality (hence also error likelihood) have lead to + // misalignments and the base may be contributing to the wrong + // pileup column. + // + // The nm_local() function returns these scores and uses it to bias + // the mapping quality, which in turn adjusts base quality. uint8_t *qual = bam_get_qual(b); - const int qhalo = 8; // 2? - int qmin = 50; // effectively caps PacBio qual too + uint8_t *seq = bam_get_seq(b); + const int qhalo = 8; // window size for base qual + int qmin = qual[0]; // min qual within qhalo + const int qhalop = 2;// window size for homopolymer qual + int qminp = qual[0]; // min qual within homopolymer halo + int base = bam_seqi(seq, 0), polyl = 0, polyr = 0; // pos, not len + + // Minimum quality of the initial homopolymer + for (i = 1; i < qlen; i++) { + if (bam_seqi(seq, i) != base) + break; + if (i < qhalop && qminp > qual[i]) + qminp = qual[i]; + } + + // Minimum quality for general bases for (i = 0; i < qlen && i < qhalo; i++) { - local_nm[i] = qual[i]; if (qmin > qual[i]) qmin = qual[i]; } + for (;i < qlen-qhalo; i++) { - //int t = (qual[i]*1 + 3*qmin)/4; // good on 60x - int t = (qual[i] + 5*qmin)/4; // good on 15x - local_nm[i] = t < qual[i] ? 
t : qual[i]; - if (qmin > qual[i+qhalo]) - qmin = qual[i+qhalo]; - else if (qmin <= qual[i-qhalo]) { + if (opts->homopoly_fix && bam_seqi(seq, i) != base) { + polyl = i; + base = bam_seqi(seq, i); + qminp = qual[i]; int j; - qmin = 50; - for (j = i-qhalo+1; j <= i+qhalo; j++) - if (qmin > qual[j]) - qmin = qual[j]; + for (j = i+1; j < qlen; j++) { + if (bam_seqi(seq, j) != base) + break; + if (i < qhalop && qminp > qual[j]) + qminp = qual[j]; + } + polyr = j-1; + } else { + // CHECK: do we want to have opts->homopoly_fix above, + // so when not in use we don't define pl to non-zero? + // Test on SynDip + polyr = polyl; } - } - for (; i < qlen; i++) { - local_nm[i] = qual[i]; - local_nm[i] = (local_nm[i] + 6*qmin)/4; - } + int pl = polyr-polyl; - for (i = 0; i < qlen; i++) { - qual[i] = local_nm[i]; + // Useful for SNPS, as we're judging the variation in + // length as an indicator for base-misalignment. + // Not so useful for indel calling where the longer the indel + // the less confident we are on the len variation being real. + int t = (opts->mode == MODE_BAYES_116) + ? (qual[i] + 5*qmin)/4 + : qual[i]/3 + (qminp-pl*2)*poly_adj; - // Plus overall rescale. - // Lower becomes lower, very high becomes a little higher. - // Helps deep GIAB, but detrimental elsewhere. (What this really - // indicates is quality calibration differs per data set.) - // It's probably something best accounted for somewhere else. - //qual[i] = qual[i]*qual[i]/40+1; - } - memset(local_nm, 0, qlen * sizeof(*local_nm)); -#else - // Skew local NM by qual vs min-qual delta - uint8_t *qual = bam_get_qual(b); - const int qhalo = 8; // 4 - int qmin = 99; - for (i = 0; i < qlen && i < qhalo; i++) { - if (qmin > qual[i]) - qmin = qual[i]; - } - for (;i < qlen-qhalo; i++) { - int t = (qual[i] + 5*qmin)/4; // good on 15x - local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + local_nm[i] += t < qual[i] ? qual[i]-t : 0; + + // Brute force qminp in polyl to polyr range. 
+ // TODO: optimise this with sliding window + qminp = qual[i]; + int k; + for (k = MAX(polyl,i-qhalop); k <= MIN(polyr,i+qhalop); k++) + if (qminp > qual[k]) + qminp = qual[k]; + if (qmin > qual[i+qhalo]) qmin = qual[i+qhalo]; else if (qmin <= qual[i-qhalo]) { @@ -486,10 +1049,36 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { } } for (; i < qlen; i++) { - int t = (qual[i] + 5*qmin)/4; // good on 15x - local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + int t = (opts->mode == MODE_BAYES_116) + ? (qual[i] + 5*qmin)/4 + : qual[i]/3 + qminp*poly_adj; + local_nm[i] += t < qual[i] ? qual[i]-t : 0; } -#endif + } + + // Fix e.g. PacBio homopolymer qualities + if (opts->homopoly_fix) + homopoly_qual_fix((bam1_t *)b); + + // local_nm[i] & ((1<<24)-1) is for SNP score adjustment. + // We also put some more basic poly-X len in local_nm[i] >> 24. + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < qlen; i++) { + int base = bam_seqi(seq, i); + int poly = 0, j, k; + for (j = i+1; j < qlen; j++) + if (bam_seqi(seq, j) != base) + break; + //printf("%d x %d\n", base, j-i); + + poly = j-i-1; if (poly > 100) poly = 100; + const int HALO=0; + for (k = i-HALO; k < j+HALO; k++) + if (k >= 0 && k < qlen) + local_nm[k] = ((MAX(poly, local_nm[k]>>24))<<24) + | (local_nm[k] & ((1<<24)-1)); + + i = j-1; } // Adjust local_nm array by the number of edits within @@ -541,7 +1130,7 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { } // substitution - for (i = pos-halo*2 >= 0 ? 
pos-halo*2 : 0; i < pos-halo; i++) + for (i = pos-halo*2 >= 0 ?pos-halo*2 :0; i < pos-halo && i < qlen; i++) local_nm[i]+=5; for (; i < pos+halo && i < qlen; i++) local_nm[i]+=10; @@ -553,11 +1142,58 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { return 1; } +void nm_free(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { + free(p->cd); + p->cd = NULL; +} + +#ifdef DO_HDW +/* + * Stirling's formula with a 1/12n correction applied to improve accuracy. + * This seems to hold remarkably true for both low and high numbers too. + */ +double lnfact(double n) { + /* Or Gosper's formula... + * return (n*ln(n) - n + ln(2*M_PI*n + M_PI/3) / 2); + */ + return ((n+0.5)*log(n) - n + log(2*M_PI)/2) + log(1 + 1/(12.0*n)); + /* + log(1 + 1/(288.0*n*n)); */ +} + +/* + * The binomical coefficient (n,k) for n trials with k successes where + * prob(success) = p. + * k n-k + * P (k|n) = n! / (k! (n-k)!) p (1-p) + * p + * + * The coefficient we are returning here is the n! / (k! (n-k)!) bit. + * We compute it using ln(n!) and then exp() the result back to avoid + * excessively large numbers. + */ +double bincoef(int n, double k) { + return exp(lnfact(n) - lnfact(k) - lnfact(n-k)); +} + +/* + * Given p == 0.5 the binomial expansion simplifies a bit, so we have + * a dedicated function for this. + */ +double binprobhalf(int n, double k) { + return bincoef(n, k) * pow(0.5, n); +} + +double lnbinprobhalf(int n, double k) { + // ln(binprobhalf) expanded up and simplified + return lnfact(n) - lnfact(k) - lnfact(n-k) - 0.69315*n; +} +#endif static int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, pileup_t *plp, consensus_opts *opts, - consensus_t *cons, int default_qual) { + consensus_t *cons, int default_qual, + cons_probs *cp) { int i, j; static int init_done =0; static double q2p[101], mqual_pow[256]; @@ -571,8 +1207,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, // if it's rare. 
// Helps a bit on deep data, especially with K2=3, but detrimental on // shallow and (currently) quite a slow down. - -//#define K2 2 #ifdef K2 int hashN[1<<(K2*4+2)] = {0}; int hash1[1<<2] = {0}; @@ -594,7 +1228,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (!init_done) { init_done = 1; - consensus_init(opts->P_het); for (i = 0; i <= 100; i++) { q2p[i] = pow(10, -i/10.0); @@ -612,6 +1245,9 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, /* Initialise */ int counts[6] = {0}; +#ifdef DO_FRACT + int counts2[2][6] = {{0}}; +#endif /* Accumulate */ @@ -639,6 +1275,9 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, int td = depth; // original depth depth = 0; +#ifdef DO_POLY_DIST + int poly_dist[2][100] = {0}; +#endif for (; plp; plp = plp->next) { pileup_t *p = plp; @@ -660,7 +1299,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, int base = i >= 0 && i < p->b.core.l_qseq ? X[bam_seqi(seq,i)] : _; hb = (hb<<2)|base; } - // fprintf(stderr, "%c: %d %d of %d\t%d %d\n", p->base, hashN[hb], hash1[base1], td, p->qual, p->qual * hashN[hb] / hash1[base1]); #undef _ #endif @@ -688,7 +1326,7 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, // convert from sam base to acgt*n order. base = L[base]; - double MM, __, _M, qe; + double MM, __, _M, oo, oM, o_, uu, um, mm, qe; // Correction for mapping quality. Maybe speed up via lookups? // Cannot nullify mapping quality completely. 
Lots of (true) @@ -698,7 +1336,8 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (flags & CONS_MQUAL) { int mqual = b->core.qual; if (opts->nm_adjust) { - mqual /= (nm_local(p, b, pos)+1); + //mqual /= (nm_local(p, b, pos)+1); + mqual /= (nm_local(p, b, b->core.pos + p->seq_offset+1)+1); mqual *= 1 + 2*(0.5-(td>30?30:td)/60.0); // depth fudge } @@ -723,32 +1362,71 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (qual < 1) qual = 1; - __ = p__[qual]; // neither match - MM = pMM[qual] - __; // both match - _M = p_M[qual] - __; // one allele only (half match) + double poly = poly_len(p, b, b->core.pos + p->seq_offset+1); +#ifdef DO_POLY_DIST + poly_dist[bam_is_rev(b)][MIN(99,(int)poly)]++; +#endif + + // EXPERIMENTAL + // Adjust qual based on homopolymer length. + // Affects different platforms by differing amounts. + // May wish to further separate to qual2 and qual3 for ins and del? + int qual2 = MAX(1, qual-(poly-2)*cp->poly_mul); + + /* MM=match _M=half-match __=mismatch */ + __ = cp->p__[qual]; // neither match + MM = cp->pMM[qual] - __; // both match + _M = cp->p_M[qual] - __; // one allele only (half match) + + /* observation ACGT, but against hypothesis ** or *base */ + oo = cp->poo[qual2] - __; + oM = cp->poM[qual2] - __; + o_ = cp->po_[qual2] - __; + + /* observation * */ + uu = cp->puu[qual2] - __; + um = cp->pum[qual2] - __; + mm = cp->pmm[qual2] - __; if (flags & CONS_DISCREP) { qe = q2p[qual]; sumsC[base] += 1 - qe; } + counts[base]++; +#ifdef DO_FRACT + counts2[bam_is_rev(b)][base]++; +#endif + + // oM should never be higher than _M for actual bases! or... 
+ //printf("base %d@%d MM %f _M %f oM %f\n", base, qual, MM, _M, oM); switch (base) { case 0: // A - S[0] += MM; - S[1] += _M; - S[2] += _M; - S[3] += _M; - S[4] += _M; + S[0] += MM; + S[1] += _M; + S[2] += _M; + S[3] += _M; + S[4] += oM; + S[8] += o_; + S[11] += o_; + S[13] += o_; + S[14] += oo; break; case 1: // C - S[1] += _M; - S[5] += MM; - S[6] += _M; - S[7] += _M; - S[8] += _M; + S[1] += _M; + S[5] += MM; + S[6] += _M; + S[7] += _M; + S[8] += oM; + S[4] += o_; + S[11] += o_; + S[13] += o_; + S[14] += oo; + + //fprintf(stderr, "%d %f %f %f\n", qual, MM+__, oo+__, MM-oo); break; case 2: // G @@ -756,55 +1434,125 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, S[ 6] += _M; S[ 9] += MM; S[10] += _M; - S[11] += _M; + S[11] += oM; + S[4] += o_; + S[8] += o_; + S[13] += o_; + S[14] += oo; break; case 3: // T - S[ 3] += _M; + S[ 3] += _M; // _m S[ 7] += _M; S[10] += _M; - S[12] += MM; - S[13] += _M; + S[12] += MM; // mm + S[13] += oM; + S[4] += o_; + S[8] += o_; + S[11] += o_; + S[14] += oo; + // S[14] oo break; case 4: // * - S[ 4] += _M; - S[ 8] += _M; - S[11] += _M; - S[13] += _M; - S[14] += MM; + // under under under under agree-no-base + S[0] += uu; S[1 ]+= uu; S[2 ]+= uu; S[3 ]+= uu; S[4 ]+= um; + S[5 ]+= uu; S[6 ]+= uu; S[7 ]+= uu; S[8 ]+= um; + S[9 ]+= uu; S[10]+= uu; S[11]+= um; + S[12]+= uu; S[13]+= um; + S[14]+= mm; break; case 5: /* N => equal weight to all A,C,G,T but not a pad */ - S[ 0] += MM; - S[ 1] += MM; - S[ 2] += MM; - S[ 3] += MM; - S[ 4] += _M; - - S[ 5] += MM; - S[ 6] += MM; - S[ 7] += MM; - S[ 8] += _M; - - S[ 9] += MM; - S[10] += MM; - S[11] += _M; - - S[12] += MM; - S[13] += _M; + S[0] += MM; S[1 ]+= MM; S[2 ]+= MM; S[3 ]+= MM; S[4 ]+= oM; + S[5 ]+= MM; S[6 ]+= MM; S[7 ]+= MM; S[8 ]+= oM; + S[9 ]+= MM; S[10]+= MM; S[11]+= oM; + S[12]+= MM; S[13]+= oM; + S[14]+= oo; break; } depth++; + } + +#ifdef DO_POLY_DIST + // Or compute mean and s.d per strand. 
+ // Then compare likelihood of strands coming from the same distribution? + // eg s.d=0.59 vs mean=3.41 sd=0.54... hmm + // + // Or compare ratio of most frequent to next most frequent, for each + // strand. + + int d1 = 0, d2 = 0; + double nd1 = 0, nd2 = 0; + int k; + for (k = 0; k < 100; k++) { + if (!poly_dist[0][k] && !poly_dist[1][k]) + continue; - if (p->eof && p->cd) { - free(p->cd); - p->cd = NULL; +// fprintf(stdout, "%ld %d %2d %2d\n", pos, k, poly_dist[0][k], poly_dist[1][k]); + d1 += (k+1)*poly_dist[0][k]; + d2 += (k+1)*poly_dist[1][k]; + nd1 += poly_dist[0][k]; + nd2 += poly_dist[1][k]; + } +// printf("Avg = %f / %f %f / %f / %f\n", +// (d1+d2+1)/(nd1+nd2+1.), +// (d1+1)/(nd1+1.), (d2+1)/(nd2+1.), +// (d2+1)/(nd2+1.) - (d1+1)/(nd1+1.), +// ((d2+1)/(nd2+1.) - (d1+1)/(nd1+1.)) / ((d1+d2+1)/(nd1+nd2+1.))); + + // Find the top two frequent lengths + int n1 = 0, n2 = 0, l1 = 0, l2 = 0; + for (k = 0; k < 100; k++) { + int poly12 = poly_dist[0][k]+poly_dist[1][k]; + if (n1 < poly12) { + n2 = n1; l2 = l1; + n1 = poly12; + l1 = k; + } else if (n2 < poly12) { + n2 = poly12; + l2 = k; } } + const double N = 5; + nd1 += 1; + nd2 += 1; + + // l1 is most common length + int pn1p = poly_dist[0][l1]; + int pn1m = poly_dist[1][l1]; + // l2 2nd most common + int pn2p = poly_dist[0][l2]; + int pn2m = poly_dist[1][l2]; + + // ratio if two most common lengths on + + double s1 = (pn1p+N) / (pn2p+N); s1 = s1>1?1/s1:s1; + // ratio if two most common lengths on - + double s2 = (pn1m+N) / (pn2m+N); s2 = s2>1?1/s2:s2; + + // ratio of s1 and s2 to identify strand bias + double sbias = s1 / s2; sbias = sbias>1?1/sbias:sbias; + + if (pn2p+pn2m > 0 && l1 != l2) { +// printf("len %d,%d + %d,%d - %d,%d\tbias = %f %f, %f %f\t%ld\n", +// l1, l2, pn1p, pn2p, pn1m, pn2m, +// s1, s2, sbias, sqrt(sbias)-1, pos); + + // adjust score for het indels + // sbias is close to 0 for strong strand bias, and 1 for none + sbias = 10*log(sbias);//+.5); + S[ 4] += sbias; // A* + S[ 8] += sbias; // C* + 
S[11] += sbias; // G* + S[13] += sbias; // T* + } else { + sbias = 0; + } +#endif + /* We've accumulated stats, so now we speculate on the consensus call */ double shift, max, max_het, norm[15]; int call = 0, het_call = 0; @@ -822,8 +1570,87 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, max = -DBL_MAX; max_het = -DBL_MAX; +#ifdef DO_FRACT + // Filter by --min-depth and --het-fract. + // Also add a slight adjustment for strand bias. + for (j = 0; j < 15; j++) { + if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14) + continue; + + double c1p = counts2[0][map_het[j]%5]; + double c1m = counts2[1][map_het[j]%5]; + double c2p = counts2[0][map_het[j]/5]; + double c2m = counts2[1][map_het[j]/5]; + + double c1 = c1p + c1m; + double c2 = c2p + c2m; + + if (c1 && c2) { + // Slight decrease in confidence if strong strand bias. + const int N = 10; // avoid low sample size problems + double b1 = 1 - (N+MIN(c1p,c1m))/(N+MAX(c1p,c1m)); + double b2 = 1 - (N+MIN(c2p,c2m))/(N+MAX(c2p,c2m)); + if (b1 > 0.5) S[j] -= b1; + if (b2 > 0.5) S[j] -= b2; + + // Fraction based filtering, via --min-depth and --het-fract opts. + c1 += 1e-5; + c2 += 1e-5; + if (c2 > c1) { + double tmp = c2; + c2 = c1; + c1 = tmp; + } + + if (c2 < opts->min_depth) + S[j] -= 100; + if (c2 / (c1+1e-5) <= opts->het_fract) + S[j] -= 100; + } + } +#endif + +#ifdef DO_HDW + /* + * Apply Hardy-Weinberg statistics for heterozygous sites. + * This helps, but it also loses sensitivity a little. + */ for (j = 0; j < 15; j++) { - S[j] += lprior15[j]; + if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14) + continue; + + double c1 = counts[map_het[j]%5]; + double c2 = counts[map_het[j]/5]; + + if (c1 && c2) { + c1 += 1e-5; + c2 += 1e-5; + if (c2 > c1) { + double tmp = c2; + c2 = c1; + c1 = tmp; + } + + // Limit depth for HW as we'll have an allele freq difference, + // even if it's just caused by alignment reference bias. 
+ double c12 = c1+c2; + if (c12 > 20) { + c2 *= 20/(c12); + c12 = 20; + c1 = 20-c2; + } + + // Helps a little, especially reducing FN deletions. + c1+=1; + c2+=1; + c12+=2; + S[j] += lnbinprobhalf(c12, c2) + fast_log2(c12)*0.69+.2; + } + } +#endif + + for (j = 0; j < 15; j++) { + S[j] += cp->lprior15[j]; if (shift < S[j]) shift = S[j]; @@ -912,6 +1739,84 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, return 0; } +// If opts->gap5 is MODE_MIXED then we use two different parameter +// sets, favouring cp_p for precision and cp_r for recall. Otherwise it's +// always cp_r only. +// +// When both calls equal, we return the same result. When they differ, +// we adjust qual based on accurate vs recall profiles. +int calculate_consensus_gap5m(hts_pos_t pos, int flags, int depth, + pileup_t *plp, consensus_opts *opts, + consensus_t *cons, int default_qual, + cons_probs *cp_r, cons_probs *cp_p) { + if (opts->mode != MODE_MIXED) + return calculate_consensus_gap5(pos, flags, depth, plp, opts, + cons, default_qual, + opts->mode == MODE_PRECISE + ? 
cp_p : cp_r); + + // EXPERIMENTAL: mixed mode + consensus_t consP, consR; + // Favours precision + calculate_consensus_gap5(pos, flags, depth, plp, opts, + &consP, default_qual, cp_p); + // Favours recall + calculate_consensus_gap5(pos, flags, depth, plp, opts, + &consR, default_qual, cp_r); + +#define MIN(a,b) ((a)<(b)?(a):(b)) +#define MAX(a,b) ((a)>(b)?(a):(b)) + + // Initial starting point is precise mode + memcpy(cons, &consP, sizeof(consP)); + + if (consP.phred > 0 && consR.phred > 0 && consP.call == consR.call) { + // Both strategies match as HOM + // Boost qual as both in agreement + cons->phred += MIN(20, consR.phred); + + } else if (consP.het_logodd >= 0 && consR.het_logodd >= 0 && + consP.het_call == consR.het_call) { + // Both strategies match as HET + // Boost qual as both in agreement + cons->het_logodd += MIN(20, consR.het_logodd); + + } else if (consP.het_logodd >= 0) { + // Accurate method claims heterozygous, so go with it. + // However sensitive method disagrees, so reduce qual a little. + int q2 = MAX(consR.phred, consR.het_logodd); + cons->het_logodd = MAX(1, (cons->het_logodd - q2/2)); + + } else if (consR.het_logodd >= 70) { + // Accurate is homozygous and consR is het, so we go with it instead + // but at a lower quality value. + // TODO: may wish to check HET is consistent with HOM? Very unlikely + // not to be though. + int q1 = consP.phred; + int q2 = consR.het_logodd; + memcpy(cons, &consR, sizeof(consR)); + cons->het_logodd = MIN(15, MAX((q2-q1*2)/2, 1+q2/(q1+1.0))); + + } else if (consR.het_logodd >= 0) { + // As above, but low quality + int q1 = consP.phred; + int q2 = consR.het_logodd; + memcpy(cons, &consR, sizeof(consR)); + cons->het_logodd = MAX(1,q2 - 0.3*q1) + + 5*(consP.het_call == consR.het_call); + cons->phred = 0; + + } else if (consR.het_logodd < 0) { + // Neither are heterozygous, but differing in phred call (V rare). + // Pick highest qual, after some scaling? 
+ consR.phred = consR.phred / 2; + if (consR.phred > consP.phred) + memcpy(cons, &consR, sizeof(consR)); + cons->phred = MAX(10, cons->phred); + } + + return 0; +} /* -------------------------------------------------------------------------- * Main processing logic @@ -973,12 +1878,12 @@ static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { * standard pileup criteria (eg COG-UK / CLIMB Covid-19 seq project). * * - * call1 / score1 / depth1 is the highest scoring allele. - * call2 / score2 / depth2 is the second highest scoring allele. + * call1 / score1 is the highest scoring allele. + * call2 / score2 is the second highest scoring allele. * * Het_fract: score2/score1 * Call_fract: score1 or score1+score2 over total score - * Min_depth: minimum total depth of utilised bases (depth1+depth2) + * Min_depth: minimum total depth of unfiltered bases (above qual/mqual) * Min_score: minimum total score of utilised bases (score1+score2) * * Eg het_fract 0.66, call_fract 0.75 and min_depth 10. @@ -999,6 +1904,7 @@ static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { static int calculate_consensus_simple(const pileup_t *plp, consensus_opts *opts, int *qual) { int i, min_qual = opts->min_qual; + int tot_depth = 0; // Map "seqi" nt16 to A,C,G,T compatibility with weights on pure bases. // where seqi is A | (C<<1) | (G<<2) | (T<<3) @@ -1049,6 +1955,7 @@ static int calculate_consensus_simple(const pileup_t *plp, freq[16] ++; score[16]+=8 * (opts->use_qual ? 
q : 1); } + tot_depth++; } // Total usable depth @@ -1058,19 +1965,15 @@ static int calculate_consensus_simple(const pileup_t *plp, // Best and second best potential calls int call1 = 15, call2 = 15; - int depth1 = 0, depth2 = 0; int score1 = 0, score2 = 0; for (i = 0; i < 5; i++) { int c = 1<= opts->het_fract * score1 && opts->ambig) { used_base |= call2; used_score += score2; - used_depth += depth2; } // N is too shallow, or insufficient proportion of total - if (used_depth < opts->min_depth || + if (tot_depth < opts->min_depth || used_score < opts->call_fract * tscore) { - used_depth = 0; // But note shallow gaps are still called gaps, not N, as // we're still more confident there is no base than it is // A, C, G or T. - used_base = call1 == 16 /*&& depth1 >= call_fract * depth*/ - ? 16 : 0; // * or N + used_base = call1 == 16 ? 16 : 0; // * or N } // Our final call. "?" shouldn't be possible to generate @@ -1102,7 +2001,7 @@ static int calculate_consensus_simple(const pileup_t *plp, "NACMGRSVTWYHKDBN" "*ac?g???t???????"; - //printf("%c %d\n", het[used_base], used_depth); + //printf("%c %d\n", het[used_base], tot_depth); if (qual) *qual = used_base ? 100.0 * used_score / tscore : 0; @@ -1169,10 +2068,11 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } } - if (opts->gap5) { + if (opts->mode != MODE_SIMPLE) { consensus_t cons; - calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual); + calculate_consensus_gap5m(pos, opts->use_mqual ? 
CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual, + &cons_prob_recall, &cons_prob_precise); if (cons.het_logodd > 0 && opts->ambig) { cb = "AMRWa" // 5x5 matrix with ACGT* per row / col "MCSYc" @@ -1307,10 +2207,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } // share this with basic_pileup - if (opts->gap5) { + if (opts->mode != MODE_SIMPLE) { consensus_t cons; - calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual); + calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual, + &cons_prob_recall, &cons_prob_precise); if (cons.het_logodd > 0 && opts->ambig) { cb = "AMRWa" // 5x5 matrix with ACGT* per row / col "MCSYc" @@ -1344,6 +2245,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, opts->last_tid = tid; return 0; } + if (opts->mark_ins && nth && cb != '*') { + kputc('_', seq); + kputc('_', qual); + } + // end of share // Append consensus base/qual to seqs @@ -1373,6 +2279,7 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return 0; } + // END OF NEW PILEUP //--------------------------------------------------------------------------- @@ -1391,14 +2298,16 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " Exclude reads with any flag bit set\n"); fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); fprintf(fp, " --min-MQ INT Exclude reads with mapping quality below INT [0]\n"); + fprintf(fp, " --min-BQ INT Exclude reads with base quality below INT [0]\n"); fprintf(fp, " --show-del yes/no Whether to show deletion as \"*\" [no]\n"); fprintf(fp, " --show-ins yes/no Whether to show insertions [yes]\n"); + fprintf(fp, " --mark-ins Add '+' before every inserted base/qual [off]\n"); fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n"); fprintf(fp, "\nFor simple consensus mode:\n"); fprintf(fp, " -q, --(no-)use-qual Use quality values in 
calculation [off]\n"); fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n"); - fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n"); - fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.5]\n"); + fprintf(fp, " -d, --min-depth INT Minimum depth of INT [2]\n"); + fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.15]\n"); fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n"); fprintf(fp, " -C, --cutoff C Consensus cutoff quality C [10]\n"); fprintf(fp, " --(no-)adj-qual Modify quality with local minima [on]\n"); @@ -1410,6 +2319,18 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " --high-MQ INT Cap maximum mapping quality [60]\n"); fprintf(fp, " --P-het FLOAT Probability of heterozygous site[%.1e]\n", P_HET); + fprintf(fp, " --P-indel FLOAT Probability of indel sites[%.1e]\n", + P_INDEL); + fprintf(fp, " --het-scale FLOAT Heterozygous SNP probability multiplier[%.1e]\n", + P_HET_SCALE); + fprintf(fp, " -p, --homopoly-fix Spread low-qual bases to both ends of homopolymers\n"); + fprintf(fp, " --homopoly-score FLOAT\n" + " Qual fraction adjustment for -p option [%g]\n", P_HOMOPOLY); + fprintf(fp, " -t, --qual-calibration FILE / :config (see man page)\n"); + fprintf(fp, " Load quality calibration file\n"); + fprintf(fp, "\n"); + fprintf(fp, " -X, --config STR Use pre-defined configuration set. 
STR from:\n"); + fprintf(fp, " hiseq, hifi, r10.4_sup, r10.4_dup and ultima\n"); fprintf(fp, "\nGlobal options:\n"); sam_global_opt_help(fp, "-.---@-."); @@ -1421,7 +2342,7 @@ int main_consensus(int argc, char **argv) { consensus_opts opts = { // User options - .gap5 = 1, + .mode = MODE_RECALL, .use_qual = 0, .min_qual = 0, .adj_qual = 1, @@ -1444,10 +2365,15 @@ int main_consensus(int argc, char **argv) { .all_bases = 0, .show_del = 0, .show_ins = 1, + .mark_ins = 0, .incl_flags = 0, .excl_flags = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, .min_mqual = 0, .P_het = P_HET, + .P_indel = P_INDEL, + .het_scale = P_HET_SCALE, + .homopoly_fix = 0, + .homopoly_redux = 0.01, // Internal state .ks_line = {0,0}, @@ -1461,6 +2387,8 @@ int main_consensus(int argc, char **argv) { .last_pos = -1, }; + set_qcal(&opts.qcal, QCAL_FLAT); + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), @@ -1489,18 +2417,27 @@ int main_consensus(int argc, char **argv) { {"het-only", no_argument, NULL, 6}, {"show-del", required_argument, NULL, 7}, {"show-ins", required_argument, NULL, 8}, + {"mark-ins", no_argument, NULL, 18}, {"output", required_argument, NULL, 'o'}, {"incl-flags", required_argument, NULL, 11}, {"rf", required_argument, NULL, 11}, {"excl-flags", required_argument, NULL, 12}, {"ff", required_argument, NULL, 12}, {"min-MQ", required_argument, NULL, 13}, + {"min-BQ", required_argument, NULL, 16}, {"P-het", required_argument, NULL, 15}, + {"P-indel", required_argument, NULL, 17}, + {"het-scale", required_argument, NULL, 19}, {"mode", required_argument, NULL, 'm'}, + {"homopoly-fix", no_argument, NULL, 'p'}, + {"homopoly-score", required_argument, NULL, 'p'+100}, + {"homopoly-redux", required_argument, NULL, 'p'+200}, + {"qual-calibration", required_argument, NULL, 't'}, + {"config", required_argument, NULL, 'X'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, 
"@:qd:c:H:r:5f:C:aAl:o:m:", + while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:", lopts, NULL)) >= 0) { switch (c) { case 'a': opts.all_bases++; break; @@ -1517,12 +2454,21 @@ int main_consensus(int argc, char **argv) { case 'r': opts.reg = optarg; break; case 'C': opts.cons_cutoff = atoi(optarg); break; case 'A': opts.ambig = 1; break; + case 'p': opts.homopoly_fix = P_HOMOPOLY; break; + case 'p'+100: opts.homopoly_fix = atof(optarg); break; + case 'p'+200: + // EXPERIMENTAL + opts.homopoly_redux = atof(optarg); break; case 1: opts.default_qual = atoi(optarg); break; case 6: opts.het_only = 1; break; case 7: opts.show_del = (*optarg == 'y' || *optarg == 'Y'); break; case 8: opts.show_ins = (*optarg == 'y' || *optarg == 'Y'); break; + case 18: opts.mark_ins = 1; break; case 13: opts.min_mqual = atoi(optarg); break; + case 16: opts.min_qual = atoi(optarg); break; case 15: opts.P_het = atof(optarg); break; + case 17: opts.P_indel = atof(optarg); break; + case 19: opts.het_scale = atof(optarg); break; case 'q'+100: opts.adj_qual = 1; break; case 'q'+101: opts.adj_qual = 0; break; case 'm'+100: opts.nm_adjust = 1; break; @@ -1532,9 +2478,22 @@ int main_consensus(int argc, char **argv) { case 'm': // mode if (strcasecmp(optarg, "simple") == 0) { - opts.gap5 = 0; - } else if (strcasecmp(optarg, "bayesian") == 0) { - opts.gap5 = 1; + opts.mode = MODE_SIMPLE; + } else if (strcasecmp(optarg, "bayesian_m") == 0) { + // EXPERIMENTAL: + // A mixture of modified precise/recall params and a + // blending of the two. Sometimes helps a bit. 
+ opts.mode = MODE_MIXED; + } else if (strcasecmp(optarg, "bayesian_p") == 0) { + // EXPERIMENTAL: + // favours precision + opts.mode = MODE_PRECISE; + } else if (strcasecmp(optarg, "bayesian_r") == 0 || + strcasecmp(optarg, "bayesian") == 0) { + // favours recall; the default + opts.mode = MODE_RECALL; + } else if (strcasecmp(optarg, "bayesian_116") == 0) { + opts.mode = MODE_BAYES_116; } else { fprintf(stderr, "Unknown mode %s\n", optarg); return 1; @@ -1566,6 +2525,67 @@ int main_consensus(int argc, char **argv) { } break; + case 'X': + if (strcasecmp(optarg, "hifi") == 0) { + set_qcal(&opts.qcal, QCAL_HIFI); + opts.mode = MODE_RECALL; + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.low_mqual = 5; + opts.scale_mqual = 1.5; + opts.het_scale = 0.37; + } else if (strcasecmp(optarg, "hiseq") == 0) { + opts.mode = MODE_RECALL; + set_qcal(&opts.qcal, QCAL_HISEQ); + opts.homopoly_redux = 0.01; + } else if (strcasecmp(optarg, "r10.4_sup") == 0) { + // Same as HiFi params, but ONT calibration table. + // At higher depth, hifi params work well for ONT + // when combined with ONT calibration chart. + // + // At lower depth we gain a bit from increasing homopoly_redux + set_qcal(&opts.qcal, QCAL_ONT_R10_4_SUP); + opts.mode = MODE_RECALL; + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.low_mqual = 5; + opts.scale_mqual = 1.5; + opts.het_scale = 0.37; + + // Also consider, for lower depth: + // opts.homopoly_redux = 1; + // opts.scale_mqual = 1; + // opts.het_scale = 0.45; + } else if (strcasecmp(optarg, "r10.4_dup") == 0) { + // Just a copy of of HiFi for duplex currently until + // we get a good truth set for calibration. 
+ set_qcal(&opts.qcal, QCAL_ONT_R10_4_DUP); + opts.mode = MODE_RECALL; + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.low_mqual = 5; + opts.scale_mqual = 1.5; + opts.het_scale = 0.37; + } else if (strcasecmp(optarg, "ultima") == 0) { + // Very similar to HiFi, but with own calibration table + opts.mode = MODE_RECALL; + set_qcal(&opts.qcal, QCAL_ULTIMA); + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.het_scale = 0.37; + opts.scale_mqual = 2; + opts.low_mqual = 10; + } else { + // NB consider defaults that are a mixture of all above. + // Options are all similar for all bar Illumina. + // Unsure what :flat calibration table does to each of + // these though. + fprintf(stderr, "Unrecognised configuration name: \"%s\"\n", + optarg); + return 1; + } + break; + case 11: if ((opts.incl_flags = bam_str2flag(optarg)) < 0) { print_error("consensus", "could not parse --rf %s", optarg); @@ -1579,6 +2599,15 @@ int main_consensus(int argc, char **argv) { } break; + case 't': // --qual-calibration + if (load_qcal(&opts.qcal, optarg) < 0) { + print_error("consensus", + "failed to load quality calibration '%s'", + optarg); + return -1; + } + break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': @@ -1586,6 +2615,44 @@ int main_consensus(int argc, char **argv) { } } +#if 0 + // Dump out the qcal table. Useful for copying into the code above. + int i; + qcal_t *q = &opts.qcal; + fprintf(stderr, "{"); + for (i = 0; i < 100; i++) + fprintf(stderr, "%2d,%s", q->smap[i],(i+1)%10?" ":"\n"); + fprintf(stderr, "},\n{"); + for (i = 0; i < 100; i++) + fprintf(stderr, "%2d,%s", q->umap[i],(i+1)%10?" ":"\n"); + fprintf(stderr, "},\n{"); + for (i = 0; i < 100; i++) + fprintf(stderr, "%2d,%s", q->omap[i],(i+1)%10?" ":"\n"); + fprintf(stderr, "}\n"); +#endif + + if (opts.mode != MODE_SIMPLE) { + if (opts.mode == MODE_PRECISE) + // More accuracy / precision, but a significant drop + // in recall. 
+ consensus_init(opts.P_het, opts.P_indel, + 0.3 * opts.het_scale, opts.homopoly_redux, + &opts.qcal, MODE_PRECISE, &cons_prob_precise); + + if (opts.mode == MODE_MIXED) + // Blend these in when running in mixed mode, so we can + // keep sensitivity but have a better joint quality to + // reduce the FP rate. + consensus_init(pow(opts.P_het, 0.7), pow(opts.P_indel, 0.7), + 0.3 * opts.het_scale, opts.homopoly_redux, + &opts.qcal, MODE_PRECISE, &cons_prob_precise); + + // Better recall, at a cost of some accuracy (false positives) + consensus_init(opts.P_het, opts.P_indel, opts.het_scale, + opts.mode == MODE_RECALL ? opts.homopoly_redux : 0.01, + &opts.qcal, MODE_RECALL, &cons_prob_recall); + } + if (argc != optind+1) { if (argc == optind) usage_exit(stdout, EXIT_SUCCESS); else usage_exit(stderr, EXIT_FAILURE); @@ -1625,8 +2692,11 @@ int main_consensus(int argc, char **argv) { } if (opts.fmt == PILEUP) { - if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, - basic_pileup, &opts) < 0) + if (pileup_loop(opts.fp, opts.h, readaln2, + opts.mode != MODE_SIMPLE ? nm_init : NULL, + basic_pileup, + opts.mode != MODE_SIMPLE ? nm_free : NULL, + &opts) < 0) goto err; if (opts.all_bases) { @@ -1641,8 +2711,10 @@ int main_consensus(int argc, char **argv) { goto err; } } else { - if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, + if (pileup_loop(opts.fp, opts.h, readaln2, + opts.mode != MODE_SIMPLE ? nm_init : NULL, basic_fasta, + opts.mode != MODE_SIMPLE ? nm_free : NULL, &opts) < 0) goto err; if (opts.all_bases) { diff --git a/samtools/bam_consensus.c.pysam.c b/samtools/bam_consensus.c.pysam.c index 08536c6..70f47ba 100644 --- a/samtools/bam_consensus.c.pysam.c +++ b/samtools/bam_consensus.c.pysam.c @@ -3,7 +3,7 @@ /* bam_consensus.c -- consensus subcommand. Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source) - Copyright (C) 2003-2005,2007-2022 Genome Research Ltd. + Copyright (C) 2003-2005,2007-2023 Genome Research Ltd. 
Author: James Bonfield @@ -101,6 +101,30 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // but 30T+ 20T- 18A+ 2A- seems like a consistent A miscall on one strand // only, while T is spread evenly across both strands. +// TODO: Phasing of long reads. +// Long reads offer very strong phasing opportunities for SNPs. +// From these, we get strong evidence for accuracy of indels. +// Specifically whether the distribution of poly-len within a phases +// is significantly different to the distribution of poly len between +// phases. + +// TODO end STR trimming. Eg: +// REF AAGCTGAAAAGTTAATGTCTTATTTTTTTTTTTTTTTTGAGATGGAGTC +// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc +// aagctgaaaagttaatgtcttattttttttt +// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc +// Middle seq doesn't validate those initial T alignments. +// Qual_train solves this by use of the STR trimmer. + +// TODO add a weight for proximity to homopolymer. +// Maybe length/distance? So 3 away from a 12-mer is similar to 1 away +// from a 4-mer? + +// TODO: Count number of base types between this point and the nearest +// indel or end of read. Eg GATCGAGAGAG*TAGC => 2 (A and G). +// adj is nbase/4 * score, or (nbase+1)/5? +// Perhaps multiplied by length too, to get local complexity score? + #include #include @@ -112,6 +136,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include "samtools.h" #include "sam_opts.h" @@ -131,6 +156,21 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # define MAX(a,b) ((a)>(b)?(a):(b)) #endif +// Defines for experiment code which is currently disabled + +// Hardy-Weinberg statistics to check heterozygous sites match allelic +// frequencies. 
+//#define DO_HDW + +// Filter bayesian calls by min-depth and min-fract parameters +//#define DO_FRACT + +// Checks uniqueness of surrounding bases to adjust scores +//#define K2 2 + +// Look for strand bias in distribution of homopolymer lengths +//#define DO_POLY_DIST + // Minimum cutoff for storing mod data; => at least 10% chance #define MOD_CUTOFF 0.46 @@ -142,6 +182,14 @@ enum format { typedef unsigned char uc; +// Simple recalibration table for substitutions, undercalls and overcalls. +// In future, we'll update this to be kmer based too. +typedef struct { + int smap[101]; // substituion or SNP + int umap[101]; // undercall or DEL + int omap[101]; // overcall or INS +} qcal_t; + typedef struct { // User options char *reg; @@ -158,7 +206,7 @@ typedef struct { int min_depth; double call_fract; double het_fract; - int gap5; + int mode; // One of MODE_* macros below enum format fmt; int cons_cutoff; int ambig; @@ -168,10 +216,16 @@ typedef struct { int all_bases; int show_del; int show_ins; + int mark_ins; int excl_flags; int incl_flags; int min_mqual; double P_het; + double P_indel; + double het_scale; + double homopoly_fix; + double homopoly_redux; + qcal_t qcal; // Internal state samFile *fp; @@ -223,7 +277,10 @@ typedef struct { float discrep; } consensus_t; -#define P_HET 1e-4 +#define P_HET 1e-3 +#define P_INDEL 2e-4 +#define P_HOMOPOLY 0.5 +#define P_HET_SCALE 1.0 #define LOG10 2.30258509299404568401 #define TENOVERLOG10 4.34294481903251827652 @@ -235,20 +292,38 @@ typedef struct { #define ALIGNED(x) #endif -static double prior[25] ALIGNED(16); /* Sum to 1.0 */ -static double lprior15[15] ALIGNED(16); /* 15 combinations of {ACGT*} */ - -/* Precomputed matrices for the consensus algorithm */ -static double pMM[101] ALIGNED(16); -static double p__[101] ALIGNED(16); -static double p_M[101] ALIGNED(16); - +// Initialised once as a global array. This won't work if threaded, +// but we'll rewrite if and when that gets added later. 
static double e_tab_a[1002] ALIGNED(16); static double *e_tab = &e_tab_a[500]; static double e_tab2_a[1002] ALIGNED(16); static double *e_tab2 = &e_tab2_a[500]; static double e_log[501] ALIGNED(16); +/* Precomputed matrices for the consensus algorithm */ +typedef struct { + double prior[25] ALIGNED(16); /* Sum to 1.0 */ + double lprior15[15] ALIGNED(16); /* 15 combinations of {ACGT*} */ + + double pMM[101] ALIGNED(16); + double p__[101] ALIGNED(16); + double p_M[101] ALIGNED(16); + double po_[101] ALIGNED(16); + double poM[101] ALIGNED(16); + double poo[101] ALIGNED(16); + double puu[101] ALIGNED(16); + double pum[101] ALIGNED(16); + double pmm[101] ALIGNED(16); + + // Multiplier on homopolymer length before reducing phred qual + double poly_mul; +} cons_probs; + +// Two sets of params; recall oriented (gap5) and precision (stf). +// We use the former unless MODE_MIXED is set (which is the default +// for bayesian consensus mode if P_indel is significant). +static cons_probs cons_prob_recall, cons_prob_precise; + /* * Lots of confusing matrix terms here, so some definitions will help. * @@ -286,11 +361,327 @@ static double e_log[501] ALIGNED(16); * The heterozygosity weight though is a per column calculation as we're * trying to model whether the column is pure or mixed. Hence this is done * once via a prior and has no affect on the individual matrix cells. + * + * We have a generic indel probability, but it's a catch all for overcall, + * undercall, alignment artifacts, homopolymer issues, etc. So we can set + * it considerably higher and just let the QUAL skew do the filtering for + * us, albeit no longer well calibrated. */ -static void consensus_init(double p_het) { +// NB: Should _M be MM? +// Ie sample really is A/C het, and we observe C. That should be a match, +// not half a match. 
+ +#define MODE_SIMPLE 0 // freq counting + +#define MODE_BAYES_116 1 // Samtools 1.16 (no indel param) +#define MODE_RECALL 2 // so called as it's the params from Gap5 +#define MODE_PRECISE 3 // a more precise set; +FN, --FP +#define MODE_MIXED 4 // Combination of GAP5/BAYES + +#define QCAL_FLAT 0 +#define QCAL_HIFI 1 +#define QCAL_HISEQ 2 +#define QCAL_ONT_R10_4_SUP 3 +#define QCAL_ONT_R10_4_DUP 4 +#define QCAL_ULTIMA 5 + +// Calibration tables here don't necessarily reflect the true accuracy. +// They have been manually tuned to work in conjunction with other command +// line parameters used in the machine profiles. For example reducing one +// qual here and increasing sensitivity elsewhere via another parameter. +static qcal_t static_qcal[6] = { + { // FLAT + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 
98, 99} + }, + + { // HiFi + {10, 11, 11, 12, 13, 14, 15, 16, 18, 19, + 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, + 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 38, 39, 39, 40, 40, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + }, + { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, + { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + } + }, + + { // HiSeq + { 2, 2, 2, 3, 3, 4, 5, 5, 6, 7, + 8, 9, 10, 11, 11, 12, 13, 14, 15, 16, + 17, 17, 18, 19, 20, 21, 22, 22, 23, 24, + 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, + 34, 34, 35, 36, 37, 38, 39, 39, 40, 41, + 42, 43, 44, 45, 45, 46, 47, 48, 49, 50, + 51, 51, 52, 53, 54, 55, 56, 56, 57, 58, + 59, 60, 61, 62, 62, 63, 64, 65, 66, 67, + 68, 68, 69, 70, 71, 72, 73, 73, 74, 75, + 76, 77, 78, 79, 79, 80, 81, 82, 83, 84, + }, + { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, + 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, + 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, + 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, + 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, + 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, + 73, 74, 75, 76, 77, 79, 
80, 81, 82, 83, + 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, + 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, + 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, + }, + { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, + 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, + 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, + 37, 38, 39, 40, 41, 43, 44, 45, 46, 47, + 49, 50, 51, 52, 53, 55, 56, 57, 58, 59, + 61, 62, 63, 64, 65, 67, 68, 69, 70, 71, + 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, + 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, + 97, 98, 99, 100, 101, 103, 104, 105, 106, 107, + 109, 110, 111, 112, 113, 115, 116, 117, 118, 119, + } + }, + { // ONT R10.4 super + { 0, 2, 2, 2, 3, 4, 4, 5, 6, 7, + 7, 8, 9, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 20, 22, 24, 25, 26, 27, 28, 29, + 30, 31, 33, 34, 36, 37, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 41, 40, 40, + 41, 41, 40, 40, 40, 40, 41, 40, 40, 40, + 40, 41, 41, 40, 40, 41, 40, 40, 39, 41, + 40, 41, 40, 40, 41, 41, 41, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + }, + { 0, 2, 2, 2, 3, 4, 5, 6, 7, 8, + 8, 9, 9, 10, 10, 10, 11, 12, 12, 13, + 13, 13, 14, 14, 15, 16, 16, 17, 18, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + }, + { 0, 4, 6, 6, 6, 7, 7, 8, 9, 9, + 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 15, 15, 15, 16, 16, 17, 17, 18, 18, 19, + 19, 20, 20, 21, 22, 22, 23, 23, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + } + }, + { // ONT R10.4 duplex; just a copy of hifi for now + {10, 11, 11, 12, 13, 14, 15, 16, 18, 19, + 20, 21, 22, 23, 24, 25, 
27, 28, 29, 30, + 31, 32, 33, 33, 34, 35, 36, 36, 37, 38, + 38, 39, 39, 40, 40, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, + }, + { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, + 10, 11, 11, 12, 13, 14, 15, 15, 16, 17, + 18, 19, 19, 20, 20, 21, 22, 23, 23, 24, + 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, + 28, 28, 27, 27, 27, 28, 28, 28, 28, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 25, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, + 28, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 28, 28, 30, 30, 30, 30, 30, 30, 30, + }, + { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, + 15, 15, 16, 17, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 23, 23, 23, 24, 24, 24, 25, + 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, + 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + } + }, + { // Ultima Genomics + { 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, + 10, 10, 11, 12, 13, 14, 14, 15, 16, 17, + 18, 18, 19, 21, 22, 23, 23, 24, 25, 26, + 27, 27, 28, 29, 30, 31, 31, 32, 33, 34, + 35, 35, 36, 37, 38, 39, 39, 40, 42, 43, + 44, 44, 45, 46, 47, 48, 48, 49, 50, 51, + 52, 52, 53, 54, 55, 56, 56, 57, 58, 59, + 60, 60, 61, 63, 64, 65, 65, 66, 67, 68, + 69, 69, 70, 71, 72, 73, 73, 74, 75, 76, + 77, 77, 78, 79, 80, 81, 81, 82, 84, 85, + }, + { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, + 5, 5, 6, 6, 7, 7, 8, 8, 9, 10, + 10, 10, 11, 12, 13, 13, 13, 14, 15, 16, + 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 22, 22, 23, 23, 24, 24, 25, + 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 
28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + }, + { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, + 5, 5, 6, 6, 7, 7, 8, 8, 9, 10, + 10, 10, 11, 12, 13, 13, 13, 14, 15, 16, + 16, 16, 17, 18, 18, 19, 19, 20, 20, 21, + 21, 22, 22, 22, 22, 23, 23, 24, 24, 25, + 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + } + } +}; + +int set_qcal(qcal_t *q, int id) { + if (id < 0 || id >= sizeof(static_qcal)/sizeof(*static_qcal)) + return -1; + + memcpy(q, &static_qcal[id], sizeof(*q)); + return 0; +} + +int load_qcal(qcal_t *q, const char *fn) { int i; + if (strcmp(fn, ":hifi") == 0) + return set_qcal(q, QCAL_HIFI); + if (strcmp(fn, ":hiseq") == 0) + return set_qcal(q, QCAL_HISEQ); + if (strcmp(fn, ":r10.4_sup") == 0) + return set_qcal(q, QCAL_ONT_R10_4_SUP); + if (strcmp(fn, ":r10.4_dup") == 0) + return set_qcal(q, QCAL_ONT_R10_4_DUP); + if (strcmp(fn, ":ultima") == 0) + return set_qcal(q, QCAL_ULTIMA); + + // default + for (i = 0; i < 101; i++) + q->smap[i] = q->umap[i] = q->omap[i] = i; + + if (strcmp(fn, ":flat") == 0) + return 0; + + hFILE *fp = hopen(fn, "r"); + if (!fp) + return -1; + + kstring_t line = KS_INITIALIZE; + int max = 0; + int last_qual = 0; + while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) { + int v, s, u, o; + if (*line.s == '#') + continue; + if (sscanf(line.s, "QUAL %d %d %d %d", &v, &s, &u, &o) != 4) + goto err; + while (v > last_qual) { + q->smap[last_qual+1] = q->smap[last_qual]; + q->umap[last_qual+1] = q->umap[last_qual]; + q->omap[last_qual+1] = q->omap[last_qual]; + last_qual++; + } + if (v >= 0 && v < 100) { + q->smap[v] = s; + q->umap[v] = u; + q->omap[v] = o; + } + if (v < max) { + fprintf(samtools_stderr, "Qual calibration file is not in ascending order\n"); + return hclose(fp) ? 
-2 : -1; + } + max = v; + } + + for (i = max+1; i < 101; i++) { + q->smap[i] = q->smap[max]; + q->umap[i] = q->umap[max]; + q->omap[i] = q->omap[max]; + } + + ks_free(&line); + return hclose(fp) < 0 ? -2 : 0; + + err: + ks_free(&line); + return hclose(fp) < 0 ? -2 : -1; +} + +static void consensus_init(double p_het, double p_indel, double het_scale, + double poly_mul, + qcal_t *qcal, int mode, cons_probs *cp) { + int i; + + // NB: only need to initialise once, but we do here for now for (i = -500; i <= 500; i++) e_tab[i] = exp(i); for (i = -500; i <= 500; i++) @@ -298,43 +689,136 @@ static void consensus_init(double p_het) { for (i = 0; i <= 500; i++) e_log[i] = log(i); - // Heterozygous locations + // EXPERIMENTAL + cp->poly_mul = poly_mul; + + // The priors make very little difference, unless shallow data. + // ACGT* by ACGT* + // So AA=0, CC=6, GG=12, TT=18, **=24 for (i = 0; i < 25; i++) - prior[i] = p_het / 20; - prior[0] = prior[6] = prior[12] = prior[18] = prior[24] = (1-p_het)/5; - - lprior15[0] = log(prior[0]); - lprior15[1] = log(prior[1]*2); - lprior15[2] = log(prior[2]*2); - lprior15[3] = log(prior[3]*2); - lprior15[4] = log(prior[4]*2); - lprior15[5] = log(prior[6]); - lprior15[6] = log(prior[7]*2); - lprior15[7] = log(prior[8]*2); - lprior15[8] = log(prior[9]*2); - lprior15[9] = log(prior[12]); - lprior15[10] = log(prior[13]*2); - lprior15[11] = log(prior[14]*2); - lprior15[12] = log(prior[18]); - lprior15[13] = log(prior[19]*2); - lprior15[14] = log(prior[24]); - - - // Rewrite as new form + cp->prior[i] = p_het / 6; // AC AG AT CG CT GT + + // Flat assumption that it is what we observe, and measure everything else + // as relative to this. 
+ cp->prior[0]=cp->prior[6]=cp->prior[12]=cp->prior[18]=cp->prior[24] = 1; + + // heterozygous deletion + for (i = 4; i < 24; i+=5) + cp->prior[i] = p_indel / 6; // /6 to be scaled vs p_het equivalently + + // heterozygous insertion + for (i = 20; i < 24; i++) + cp->prior[i] = p_indel / 6; + + cp->lprior15[0] = log(cp->prior[0]); + cp->lprior15[1] = log(cp->prior[1]); + cp->lprior15[2] = log(cp->prior[2]); + cp->lprior15[3] = log(cp->prior[3]); + cp->lprior15[4] = log(cp->prior[4]); + cp->lprior15[5] = log(cp->prior[6]); + cp->lprior15[6] = log(cp->prior[7]); + cp->lprior15[7] = log(cp->prior[8]); + cp->lprior15[8] = log(cp->prior[9]); + cp->lprior15[9] = log(cp->prior[12]); + cp->lprior15[10] = log(cp->prior[13]); + cp->lprior15[11] = log(cp->prior[14]); + cp->lprior15[12] = log(cp->prior[18]); + cp->lprior15[13] = log(cp->prior[19]); + cp->lprior15[14] = log(cp->prior[24]); + for (i = 1; i < 101; i++) { - double prob = 1 - pow(10, -i / 10.0); - - // May want to multiply all these by 5 so pMM[i] becomes close - // to -0 for most data. This makes the sums increment very slowly, - // keeping bit precision in the accumulator. - pMM[i] = log(prob/5); - p__[i] = log((1-prob)/20); - p_M[i] = log((exp(pMM[i]) + exp(p__[i]))/2); + double prob = 1 - pow(10, -qcal->smap[i] / 10.0); + + // Or is it that prob is 1-p(subst)-p(overcall)? + cp->pMM[i] = log(prob); + + //cp->p__[i] = log(1-prob); // Big help to PB-CCS SNPs; unless fudged + cp->p__[i] = log((1-prob)/3); // correct? poor on PB-CCS w/o fudge + + // Mixed alleles; just average two likelihoods + cp->p_M[i] = log((exp(cp->pMM[i]) + exp(cp->p__[i]))/2); + + // What does this really mean? Can we simulate this by priors? + // It reduces the likelihood of calling het sites, which is + // maybe compensation for alignment artifacts? I'm unsure, + // but it works (to differing degrees) on both PacBio HiFi and + // Illumina HiSeq. 
It (obviously) loses true hets, but + // potentially this can be compensated for by tweaking P-het + // (which is entirely in the priors). + // + // Low het_scale reduces false positives by making hets less + // likely to be called. In high depth data we normally have + // enough evidence to call correctly even with low het_scale, + // so it's a good +FN vs --FP tradeoff. However on low depth + // data, het_scale can filter out too many true variants. + // + // TODO: So consider adjusting at the end maybe? + // Also consider never changing calls, but changing their + // confidence, so the data is what produces the call with the + // parameters skewing the quality score distribution. + cp->p_M[i] += log(het_scale); + + if (mode == MODE_BAYES_116) { + // Compatibility with samtools 1.16 + + // This had no differention for indel vs substitution error rates, + // so o(vercall) and u(undercall) are subst(_). + cp->pmm[i] = cp->pMM[i]; + cp->poM[i] = cp->p_M[i]; + cp->pum[i] = cp->p_M[i]; + cp->po_[i] = cp->p__[i]; + cp->poo[i] = cp->p__[i]; + cp->puu[i] = cp->p__[i]; + + } else { + // When observing A C G T; leads to insertion calls + prob = 1 - pow(10, -qcal->omap[i] / 10.0); + // /3 for consistency with ACGT rem as relative likelihoods. + // Otherwise with flat priors we end up calling all shallow data + // as "*", which is illogical. + cp->poo[i] = log((1-prob)/3); + + // Ensure pMM is always more likely. (NB: This shouldn't happen + // now with the addition of the /3 step above.) + if (cp->poo[i] > cp->pMM[i]-.5) + cp->poo[i] = cp->pMM[i]-.5; + + cp->po_[i] = log((exp(cp->poo[i]) + exp(cp->p__[i]))/2); + cp->poM[i] = log((exp(cp->poo[i]) + exp(cp->pMM[i]))/2); + + // Overcalls should never be twice as likely than mismatches. + // Het bases are mix of _M (other) and MM ops (this). + // It's fine for _M to be less likely than oM (more likely + // to be overcalled than miscalled), but it should never + // be stronger when combined with other mixed data. 
+ if (cp->poM[i] > cp->p_M[i]+.5) + cp->poM[i] = cp->p_M[i]+.5; + + // Note --low-MQ and --scale-MQ have a big impact on + // undercall errs. May need to separate these options per + // type, but how? + // Multiple-calls, as with mixed mode? This feels like a cheat + + prob = 1 - pow(10, -qcal->umap[i] / 10.0); + cp->pmm[i] = log(prob); + cp->puu[i] = log((1-prob)/3); + if (cp->puu[i] > cp->pMM[i]-.5) // MM is -ve + cp->puu[i] = cp->pMM[i]-.5; + + cp->pum[i] = log((exp(cp->puu[i]) + exp(cp->pmm[i]))/2); + } } - pMM[0] = pMM[1]; - p__[0] = p__[1]; - p_M[0] = p_M[1]; + cp->pMM[0] = cp->pMM[1]; + cp->p__[0] = cp->p__[1]; + cp->p_M[0] = cp->p_M[1]; + + cp->pmm[0] = cp->pmm[1]; + cp->poo[0] = cp->poo[1]; + cp->po_[0] = cp->po_[1]; + cp->poM[0] = cp->poM[1]; + cp->puu[0] = cp->puu[1]; + cp->pum[0] = cp->pum[1]; } static inline double fast_exp(double y) { @@ -382,6 +866,51 @@ int nins(const bam1_t *b){ return indel; } +/* + * Some machines, including 454 and PacBio, store the quality values in + * homopolymers with the first or last base always being the low quality + * state. This can cause problems when reverse-complementing and aligning, + * especially when we left-justify indels. + * + * Other platforms take the approach of having the middle bases high and + * the low confidence spread evenly to both start and end. This means + * reverse-complementing doesn't introduce any strand bias. + * + * We redistribute qualities within homopolymers in this style to fix + * naive consensus or variant calling algorithms. 
+ */ +void homopoly_qual_fix(bam1_t *b) { + static double ph2err[256] = {0}; + int i; + if (!ph2err[0]) { + for (i = 0; i < 256; i++) + ph2err[i] = pow(10, i/-10.0); + } + uint8_t *seq = bam_get_seq(b); + uint8_t *qual = bam_get_qual(b); + for (i = 0; i < b->core.l_qseq; i++) { + int s = i; // start of homopoly + int base = bam_seqi(seq, i); + while (i+1 < b->core.l_qseq && bam_seqi(seq, i+1) == base) + i++; + // s..i inclusive is now homopolymer + + if (s == i) + continue; + + // Simplest: reverse if end_qual < start_qual + // Next: average outer-most two, then next two, etc + // Best: fully redistribute so start/end lower qual than centre + + // Middle route of averaging outer pairs is sufficient? + int j, k; + for (j = s, k = i; j < k; j++,k--) { + double e = ph2err[qual[j]] + ph2err[qual[k]]; + qual[j] = qual[k] = -fast_log2(e/2)*3.0104+.49; + } + } +} + // Return the local NM figure within halo (+/- HALO) of pos. // This local NM is used as a way to modify MAPQ to get a localised MAPQ // score via an adhoc fashion. @@ -391,11 +920,22 @@ double nm_local(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { return 0; pos -= b->core.pos; if (pos < 0) - return nm[0]; + return nm[0] & ((1<<24)-1); if (pos >= b->core.l_qseq) - return nm[b->core.l_qseq-1]; + return nm[b->core.l_qseq-1] & ((1<<24)-1); + + return (nm[pos] & ((1<<24)-1)) / 10.0; +} - return nm[pos] / 10.0; +int poly_len(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { + int *nm = (int *)p->cd; + if (!nm) + return 0; + pos -= b->core.pos; + if (pos >= 0 && pos < b->core.l_qseq) + return nm[pos] >> 24; + else + return 0; } /* @@ -415,68 +955,91 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { const bam1_t *b = &p->b; int qlen = b->core.l_qseq, i; + if (qlen <= 0) + return 0; int *local_nm = calloc(qlen, sizeof(*local_nm)); if (!local_nm) return -1; p->cd = local_nm; + double poly_adj = opts->homopoly_fix ? 
opts->homopoly_fix : 1; + if (opts->adj_qual) { -#if 0 - // Tweak by localised quality. - // Quality is reduced by a significant portion of the minimum quality - // in neighbouring bases, on the pretext that if the region is bad, then - // this base is bad even if it claims otherwise. + // Set local_nm based on a function of current qual and the local + // minimum qual within the surrounding window. + // + // Basically if we're in a region of low confidence then we downgrade + // higher qual outliers as they may not be as trustworthy as they + // claim. This may be because the qualities have been assigned to + // the wrong or arbitrary base (very common in homopolymers), or the + // surrounding quality (hence also error likelihood) have lead to + // misalignments and the base may be contributing to the wrong + // pileup column. + // + // The nm_local() function returns these scores and uses it to bias + // the mapping quality, which in turn adjusts base quality. uint8_t *qual = bam_get_qual(b); - const int qhalo = 8; // 2? - int qmin = 50; // effectively caps PacBio qual too + uint8_t *seq = bam_get_seq(b); + const int qhalo = 8; // window size for base qual + int qmin = qual[0]; // min qual within qhalo + const int qhalop = 2;// window size for homopolymer qual + int qminp = qual[0]; // min qual within homopolymer halo + int base = bam_seqi(seq, 0), polyl = 0, polyr = 0; // pos, not len + + // Minimum quality of the initial homopolymer + for (i = 1; i < qlen; i++) { + if (bam_seqi(seq, i) != base) + break; + if (i < qhalop && qminp > qual[i]) + qminp = qual[i]; + } + + // Minimum quality for general bases for (i = 0; i < qlen && i < qhalo; i++) { - local_nm[i] = qual[i]; if (qmin > qual[i]) qmin = qual[i]; } + for (;i < qlen-qhalo; i++) { - //int t = (qual[i]*1 + 3*qmin)/4; // good on 60x - int t = (qual[i] + 5*qmin)/4; // good on 15x - local_nm[i] = t < qual[i] ? 
t : qual[i]; - if (qmin > qual[i+qhalo]) - qmin = qual[i+qhalo]; - else if (qmin <= qual[i-qhalo]) { + if (opts->homopoly_fix && bam_seqi(seq, i) != base) { + polyl = i; + base = bam_seqi(seq, i); + qminp = qual[i]; int j; - qmin = 50; - for (j = i-qhalo+1; j <= i+qhalo; j++) - if (qmin > qual[j]) - qmin = qual[j]; + for (j = i+1; j < qlen; j++) { + if (bam_seqi(seq, j) != base) + break; + if (i < qhalop && qminp > qual[j]) + qminp = qual[j]; + } + polyr = j-1; + } else { + // CHECK: do we want to have opts->homopoly_fix above, + // so when not in use we don't define pl to non-zero? + // Test on SynDip + polyr = polyl; } - } - for (; i < qlen; i++) { - local_nm[i] = qual[i]; - local_nm[i] = (local_nm[i] + 6*qmin)/4; - } + int pl = polyr-polyl; - for (i = 0; i < qlen; i++) { - qual[i] = local_nm[i]; + // Useful for SNPS, as we're judging the variation in + // length as an indicator for base-misalignment. + // Not so useful for indel calling where the longer the indel + // the less confident we are on the len variation being real. + int t = (opts->mode == MODE_BAYES_116) + ? (qual[i] + 5*qmin)/4 + : qual[i]/3 + (qminp-pl*2)*poly_adj; - // Plus overall rescale. - // Lower becomes lower, very high becomes a little higher. - // Helps deep GIAB, but detrimental elsewhere. (What this really - // indicates is quality calibration differs per data set.) - // It's probably something best accounted for somewhere else. - //qual[i] = qual[i]*qual[i]/40+1; - } - memset(local_nm, 0, qlen * sizeof(*local_nm)); -#else - // Skew local NM by qual vs min-qual delta - uint8_t *qual = bam_get_qual(b); - const int qhalo = 8; // 4 - int qmin = 99; - for (i = 0; i < qlen && i < qhalo; i++) { - if (qmin > qual[i]) - qmin = qual[i]; - } - for (;i < qlen-qhalo; i++) { - int t = (qual[i] + 5*qmin)/4; // good on 15x - local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + local_nm[i] += t < qual[i] ? qual[i]-t : 0; + + // Brute force qminp in polyl to polyr range. 
+ // TODO: optimise this with sliding window + qminp = qual[i]; + int k; + for (k = MAX(polyl,i-qhalop); k <= MIN(polyr,i+qhalop); k++) + if (qminp > qual[k]) + qminp = qual[k]; + if (qmin > qual[i+qhalo]) qmin = qual[i+qhalo]; else if (qmin <= qual[i-qhalo]) { @@ -488,10 +1051,36 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { } } for (; i < qlen; i++) { - int t = (qual[i] + 5*qmin)/4; // good on 15x - local_nm[i] += t < qual[i] ? (qual[i]-t) : 0; + int t = (opts->mode == MODE_BAYES_116) + ? (qual[i] + 5*qmin)/4 + : qual[i]/3 + qminp*poly_adj; + local_nm[i] += t < qual[i] ? qual[i]-t : 0; } -#endif + } + + // Fix e.g. PacBio homopolymer qualities + if (opts->homopoly_fix) + homopoly_qual_fix((bam1_t *)b); + + // local_nm[i] & ((1<<24)-1) is for SNP score adjustment. + // We also put some more basic poly-X len in local_nm[i] >> 24. + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < qlen; i++) { + int base = bam_seqi(seq, i); + int poly = 0, j, k; + for (j = i+1; j < qlen; j++) + if (bam_seqi(seq, j) != base) + break; + //printf("%d x %d\n", base, j-i); + + poly = j-i-1; if (poly > 100) poly = 100; + const int HALO=0; + for (k = i-HALO; k < j+HALO; k++) + if (k >= 0 && k < qlen) + local_nm[k] = ((MAX(poly, local_nm[k]>>24))<<24) + | (local_nm[k] & ((1<<24)-1)); + + i = j-1; } // Adjust local_nm array by the number of edits within @@ -543,7 +1132,7 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { } // substitution - for (i = pos-halo*2 >= 0 ? 
pos-halo*2 : 0; i < pos-halo; i++) + for (i = pos-halo*2 >= 0 ?pos-halo*2 :0; i < pos-halo && i < qlen; i++) local_nm[i]+=5; for (; i < pos+halo && i < qlen; i++) local_nm[i]+=10; @@ -555,11 +1144,58 @@ int nm_init(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { return 1; } +void nm_free(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { + free(p->cd); + p->cd = NULL; +} + +#ifdef DO_HDW +/* + * Stirling's formula with a 1/12n correction applied to improve accuracy. + * This seems to hold remarkably true for both low and high numbers too. + */ +double lnfact(double n) { + /* Or Gosper's formula... + * return (n*ln(n) - n + ln(2*M_PI*n + M_PI/3) / 2); + */ + return ((n+0.5)*log(n) - n + log(2*M_PI)/2) + log(1 + 1/(12.0*n)); + /* + log(1 + 1/(288.0*n*n)); */ +} + +/* + * The binomical coefficient (n,k) for n trials with k successes where + * prob(success) = p. + * k n-k + * P (k|n) = n! / (k! (n-k)!) p (1-p) + * p + * + * The coefficient we are returning here is the n! / (k! (n-k)!) bit. + * We compute it using ln(n!) and then exp() the result back to avoid + * excessively large numbers. + */ +double bincoef(int n, double k) { + return exp(lnfact(n) - lnfact(k) - lnfact(n-k)); +} + +/* + * Given p == 0.5 the binomial expansion simplifies a bit, so we have + * a dedicated function for this. + */ +double binprobhalf(int n, double k) { + return bincoef(n, k) * pow(0.5, n); +} + +double lnbinprobhalf(int n, double k) { + // ln(binprobhalf) expanded up and simplified + return lnfact(n) - lnfact(k) - lnfact(n-k) - 0.69315*n; +} +#endif static int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, pileup_t *plp, consensus_opts *opts, - consensus_t *cons, int default_qual) { + consensus_t *cons, int default_qual, + cons_probs *cp) { int i, j; static int init_done =0; static double q2p[101], mqual_pow[256]; @@ -573,8 +1209,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, // if it's rare. 
// Helps a bit on deep data, especially with K2=3, but detrimental on // shallow and (currently) quite a slow down. - -//#define K2 2 #ifdef K2 int hashN[1<<(K2*4+2)] = {0}; int hash1[1<<2] = {0}; @@ -596,7 +1230,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (!init_done) { init_done = 1; - consensus_init(opts->P_het); for (i = 0; i <= 100; i++) { q2p[i] = pow(10, -i/10.0); @@ -614,6 +1247,9 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, /* Initialise */ int counts[6] = {0}; +#ifdef DO_FRACT + int counts2[2][6] = {{0}}; +#endif /* Accumulate */ @@ -641,6 +1277,9 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, int td = depth; // original depth depth = 0; +#ifdef DO_POLY_DIST + int poly_dist[2][100] = {0}; +#endif for (; plp; plp = plp->next) { pileup_t *p = plp; @@ -662,7 +1301,6 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, int base = i >= 0 && i < p->b.core.l_qseq ? X[bam_seqi(seq,i)] : _; hb = (hb<<2)|base; } - // fprintf(samtools_stderr, "%c: %d %d of %d\t%d %d\n", p->base, hashN[hb], hash1[base1], td, p->qual, p->qual * hashN[hb] / hash1[base1]); #undef _ #endif @@ -690,7 +1328,7 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, // convert from sam base to acgt*n order. base = L[base]; - double MM, __, _M, qe; + double MM, __, _M, oo, oM, o_, uu, um, mm, qe; // Correction for mapping quality. Maybe speed up via lookups? // Cannot nullify mapping quality completely. 
Lots of (true) @@ -700,7 +1338,8 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (flags & CONS_MQUAL) { int mqual = b->core.qual; if (opts->nm_adjust) { - mqual /= (nm_local(p, b, pos)+1); + //mqual /= (nm_local(p, b, pos)+1); + mqual /= (nm_local(p, b, b->core.pos + p->seq_offset+1)+1); mqual *= 1 + 2*(0.5-(td>30?30:td)/60.0); // depth fudge } @@ -725,32 +1364,71 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, if (qual < 1) qual = 1; - __ = p__[qual]; // neither match - MM = pMM[qual] - __; // both match - _M = p_M[qual] - __; // one allele only (half match) + double poly = poly_len(p, b, b->core.pos + p->seq_offset+1); +#ifdef DO_POLY_DIST + poly_dist[bam_is_rev(b)][MIN(99,(int)poly)]++; +#endif + + // EXPERIMENTAL + // Adjust qual based on homopolymer length. + // Affects different platforms by differing amounts. + // May wish to further separate to qual2 and qual3 for ins and del? + int qual2 = MAX(1, qual-(poly-2)*cp->poly_mul); + + /* MM=match _M=half-match __=mismatch */ + __ = cp->p__[qual]; // neither match + MM = cp->pMM[qual] - __; // both match + _M = cp->p_M[qual] - __; // one allele only (half match) + + /* observation ACGT, but against hypothesis ** or *base */ + oo = cp->poo[qual2] - __; + oM = cp->poM[qual2] - __; + o_ = cp->po_[qual2] - __; + + /* observation * */ + uu = cp->puu[qual2] - __; + um = cp->pum[qual2] - __; + mm = cp->pmm[qual2] - __; if (flags & CONS_DISCREP) { qe = q2p[qual]; sumsC[base] += 1 - qe; } + counts[base]++; +#ifdef DO_FRACT + counts2[bam_is_rev(b)][base]++; +#endif + + // oM should never be higher than _M for actual bases! or... 
+ //printf("base %d@%d MM %f _M %f oM %f\n", base, qual, MM, _M, oM); switch (base) { case 0: // A - S[0] += MM; - S[1] += _M; - S[2] += _M; - S[3] += _M; - S[4] += _M; + S[0] += MM; + S[1] += _M; + S[2] += _M; + S[3] += _M; + S[4] += oM; + S[8] += o_; + S[11] += o_; + S[13] += o_; + S[14] += oo; break; case 1: // C - S[1] += _M; - S[5] += MM; - S[6] += _M; - S[7] += _M; - S[8] += _M; + S[1] += _M; + S[5] += MM; + S[6] += _M; + S[7] += _M; + S[8] += oM; + S[4] += o_; + S[11] += o_; + S[13] += o_; + S[14] += oo; + + //fprintf(samtools_stderr, "%d %f %f %f\n", qual, MM+__, oo+__, MM-oo); break; case 2: // G @@ -758,55 +1436,125 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, S[ 6] += _M; S[ 9] += MM; S[10] += _M; - S[11] += _M; + S[11] += oM; + S[4] += o_; + S[8] += o_; + S[13] += o_; + S[14] += oo; break; case 3: // T - S[ 3] += _M; + S[ 3] += _M; // _m S[ 7] += _M; S[10] += _M; - S[12] += MM; - S[13] += _M; + S[12] += MM; // mm + S[13] += oM; + S[4] += o_; + S[8] += o_; + S[11] += o_; + S[14] += oo; + // S[14] oo break; case 4: // * - S[ 4] += _M; - S[ 8] += _M; - S[11] += _M; - S[13] += _M; - S[14] += MM; + // under under under under agree-no-base + S[0] += uu; S[1 ]+= uu; S[2 ]+= uu; S[3 ]+= uu; S[4 ]+= um; + S[5 ]+= uu; S[6 ]+= uu; S[7 ]+= uu; S[8 ]+= um; + S[9 ]+= uu; S[10]+= uu; S[11]+= um; + S[12]+= uu; S[13]+= um; + S[14]+= mm; break; case 5: /* N => equal weight to all A,C,G,T but not a pad */ - S[ 0] += MM; - S[ 1] += MM; - S[ 2] += MM; - S[ 3] += MM; - S[ 4] += _M; - - S[ 5] += MM; - S[ 6] += MM; - S[ 7] += MM; - S[ 8] += _M; - - S[ 9] += MM; - S[10] += MM; - S[11] += _M; - - S[12] += MM; - S[13] += _M; + S[0] += MM; S[1 ]+= MM; S[2 ]+= MM; S[3 ]+= MM; S[4 ]+= oM; + S[5 ]+= MM; S[6 ]+= MM; S[7 ]+= MM; S[8 ]+= oM; + S[9 ]+= MM; S[10]+= MM; S[11]+= oM; + S[12]+= MM; S[13]+= oM; + S[14]+= oo; break; } depth++; + } + +#ifdef DO_POLY_DIST + // Or compute mean and s.d per strand. 
+ // Then compare likelihood of strands coming from the same distribution? + // eg s.d=0.59 vs mean=3.41 sd=0.54... hmm + // + // Or compare ratio of most frequent to next most frequent, for each + // strand. + + int d1 = 0, d2 = 0; + double nd1 = 0, nd2 = 0; + int k; + for (k = 0; k < 100; k++) { + if (!poly_dist[0][k] && !poly_dist[1][k]) + continue; - if (p->eof && p->cd) { - free(p->cd); - p->cd = NULL; +// fprintf(samtools_stdout, "%ld %d %2d %2d\n", pos, k, poly_dist[0][k], poly_dist[1][k]); + d1 += (k+1)*poly_dist[0][k]; + d2 += (k+1)*poly_dist[1][k]; + nd1 += poly_dist[0][k]; + nd2 += poly_dist[1][k]; + } +// fprintf(samtools_stdout, "Avg = %f / %f %f / %f / %f\n", +// (d1+d2+1)/(nd1+nd2+1.), +// (d1+1)/(nd1+1.), (d2+1)/(nd2+1.), +// (d2+1)/(nd2+1.) - (d1+1)/(nd1+1.), +// ((d2+1)/(nd2+1.) - (d1+1)/(nd1+1.)) / ((d1+d2+1)/(nd1+nd2+1.))); + + // Find the top two frequent lengths + int n1 = 0, n2 = 0, l1 = 0, l2 = 0; + for (k = 0; k < 100; k++) { + int poly12 = poly_dist[0][k]+poly_dist[1][k]; + if (n1 < poly12) { + n2 = n1; l2 = l1; + n1 = poly12; + l1 = k; + } else if (n2 < poly12) { + n2 = poly12; + l2 = k; } } + const double N = 5; + nd1 += 1; + nd2 += 1; + + // l1 is most common length + int pn1p = poly_dist[0][l1]; + int pn1m = poly_dist[1][l1]; + // l2 2nd most common + int pn2p = poly_dist[0][l2]; + int pn2m = poly_dist[1][l2]; + + // ratio if two most common lengths on + + double s1 = (pn1p+N) / (pn2p+N); s1 = s1>1?1/s1:s1; + // ratio if two most common lengths on - + double s2 = (pn1m+N) / (pn2m+N); s2 = s2>1?1/s2:s2; + + // ratio of s1 and s2 to identify strand bias + double sbias = s1 / s2; sbias = sbias>1?1/sbias:sbias; + + if (pn2p+pn2m > 0 && l1 != l2) { +// fprintf(samtools_stdout, "len %d,%d + %d,%d - %d,%d\tbias = %f %f, %f %f\t%ld\n", +// l1, l2, pn1p, pn2p, pn1m, pn2m, +// s1, s2, sbias, sqrt(sbias)-1, pos); + + // adjust score for het indels + // sbias is close to 0 for strong strand bias, and 1 for none + sbias = 10*log(sbias);//+.5); + S[ 
4] += sbias; // A* + S[ 8] += sbias; // C* + S[11] += sbias; // G* + S[13] += sbias; // T* + } else { + sbias = 0; + } +#endif + /* We've accumulated stats, so now we speculate on the consensus call */ double shift, max, max_het, norm[15]; int call = 0, het_call = 0; @@ -824,8 +1572,87 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, max = -DBL_MAX; max_het = -DBL_MAX; +#ifdef DO_FRACT + // Filter by --min-depth and --het-fract. + // Also add a slight adjustment for strand bias. + for (j = 0; j < 15; j++) { + if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14) + continue; + + double c1p = counts2[0][map_het[j]%5]; + double c1m = counts2[1][map_het[j]%5]; + double c2p = counts2[0][map_het[j]/5]; + double c2m = counts2[1][map_het[j]/5]; + + double c1 = c1p + c1m; + double c2 = c2p + c2m; + + if (c1 && c2) { + // Slight decrease in confidence if strong strand bias. + const int N = 10; // avoid low sample size problems + double b1 = 1 - (N+MIN(c1p,c1m))/(N+MAX(c1p,c1m)); + double b2 = 1 - (N+MIN(c2p,c2m))/(N+MAX(c2p,c2m)); + if (b1 > 0.5) S[j] -= b1; + if (b2 > 0.5) S[j] -= b2; + + // Fraction based filtering, via --min-depth and --het-fract opts. + c1 += 1e-5; + c2 += 1e-5; + if (c2 > c1) { + double tmp = c2; + c2 = c1; + c1 = tmp; + } + + if (c2 < opts->min_depth) + S[j] -= 100; + if (c2 / (c1+1e-5) <= opts->het_fract) + S[j] -= 100; + } + } +#endif + +#ifdef DO_HDW + /* + * Apply Hardy-Weinberg statistics for heterozygous sites. + * This helps, but it also loses sensitivity a little. + */ for (j = 0; j < 15; j++) { - S[j] += lprior15[j]; + if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14) + continue; + + double c1 = counts[map_het[j]%5]; + double c2 = counts[map_het[j]/5]; + + if (c1 && c2) { + c1 += 1e-5; + c2 += 1e-5; + if (c2 > c1) { + double tmp = c2; + c2 = c1; + c1 = tmp; + } + + // Limit depth for HW as we'll have an allele freq difference, + // even if it's just caused by alignment reference bias. 
+ double c12 = c1+c2; + if (c12 > 20) { + c2 *= 20/(c12); + c12 = 20; + c1 = 20-c2; + } + + // Helps a little, especially reducing FN deletions. + c1+=1; + c2+=1; + c12+=2; + S[j] += lnbinprobhalf(c12, c2) + fast_log2(c12)*0.69+.2; + } + } +#endif + + for (j = 0; j < 15; j++) { + S[j] += cp->lprior15[j]; if (shift < S[j]) shift = S[j]; @@ -914,6 +1741,84 @@ int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth, return 0; } +// If opts->gap5 is MODE_MIXED then we use two different parameter +// sets, favouring cp_p for precision and cp_r for recall. Otherwise it's +// always cp_r only. +// +// When both calls equal, we return the same result. When they differ, +// we adjust qual based on accurate vs recall profiles. +int calculate_consensus_gap5m(hts_pos_t pos, int flags, int depth, + pileup_t *plp, consensus_opts *opts, + consensus_t *cons, int default_qual, + cons_probs *cp_r, cons_probs *cp_p) { + if (opts->mode != MODE_MIXED) + return calculate_consensus_gap5(pos, flags, depth, plp, opts, + cons, default_qual, + opts->mode == MODE_PRECISE + ? 
cp_p : cp_r); + + // EXPERIMENTAL: mixed mode + consensus_t consP, consR; + // Favours precision + calculate_consensus_gap5(pos, flags, depth, plp, opts, + &consP, default_qual, cp_p); + // Favours recall + calculate_consensus_gap5(pos, flags, depth, plp, opts, + &consR, default_qual, cp_r); + +#define MIN(a,b) ((a)<(b)?(a):(b)) +#define MAX(a,b) ((a)>(b)?(a):(b)) + + // Initial starting point is precise mode + memcpy(cons, &consP, sizeof(consP)); + + if (consP.phred > 0 && consR.phred > 0 && consP.call == consR.call) { + // Both strategies match as HOM + // Boost qual as both in agreement + cons->phred += MIN(20, consR.phred); + + } else if (consP.het_logodd >= 0 && consR.het_logodd >= 0 && + consP.het_call == consR.het_call) { + // Both strategies match as HET + // Boost qual as both in agreement + cons->het_logodd += MIN(20, consR.het_logodd); + + } else if (consP.het_logodd >= 0) { + // Accurate method claims heterozygous, so go with it. + // However sensitive method disagrees, so reduce qual a little. + int q2 = MAX(consR.phred, consR.het_logodd); + cons->het_logodd = MAX(1, (cons->het_logodd - q2/2)); + + } else if (consR.het_logodd >= 70) { + // Accurate is homozygous and consR is het, so we go with it instead + // but at a lower quality value. + // TODO: may wish to check HET is consistent with HOM? Very unlikely + // not to be though. + int q1 = consP.phred; + int q2 = consR.het_logodd; + memcpy(cons, &consR, sizeof(consR)); + cons->het_logodd = MIN(15, MAX((q2-q1*2)/2, 1+q2/(q1+1.0))); + + } else if (consR.het_logodd >= 0) { + // As above, but low quality + int q1 = consP.phred; + int q2 = consR.het_logodd; + memcpy(cons, &consR, sizeof(consR)); + cons->het_logodd = MAX(1,q2 - 0.3*q1) + + 5*(consP.het_call == consR.het_call); + cons->phred = 0; + + } else if (consR.het_logodd < 0) { + // Neither are heterozygous, but differing in phred call (V rare). + // Pick highest qual, after some scaling? 
+ consR.phred = consR.phred / 2; + if (consR.phred > consP.phred) + memcpy(cons, &consR, sizeof(consR)); + cons->phred = MAX(10, cons->phred); + } + + return 0; +} /* -------------------------------------------------------------------------- * Main processing logic @@ -975,12 +1880,12 @@ static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { * standard pileup criteria (eg COG-UK / CLIMB Covid-19 seq project). * * - * call1 / score1 / depth1 is the highest scoring allele. - * call2 / score2 / depth2 is the second highest scoring allele. + * call1 / score1 is the highest scoring allele. + * call2 / score2 is the second highest scoring allele. * * Het_fract: score2/score1 * Call_fract: score1 or score1+score2 over total score - * Min_depth: minimum total depth of utilised bases (depth1+depth2) + * Min_depth: minimum total depth of unfiltered bases (above qual/mqual) * Min_score: minimum total score of utilised bases (score1+score2) * * Eg het_fract 0.66, call_fract 0.75 and min_depth 10. @@ -1001,6 +1906,7 @@ static int readaln2(void *dat, samFile *fp, sam_hdr_t *h, bam1_t *b) { static int calculate_consensus_simple(const pileup_t *plp, consensus_opts *opts, int *qual) { int i, min_qual = opts->min_qual; + int tot_depth = 0; // Map "seqi" nt16 to A,C,G,T compatibility with weights on pure bases. // where seqi is A | (C<<1) | (G<<2) | (T<<3) @@ -1051,6 +1957,7 @@ static int calculate_consensus_simple(const pileup_t *plp, freq[16] ++; score[16]+=8 * (opts->use_qual ? 
q : 1); } + tot_depth++; } // Total usable depth @@ -1060,19 +1967,15 @@ static int calculate_consensus_simple(const pileup_t *plp, // Best and second best potential calls int call1 = 15, call2 = 15; - int depth1 = 0, depth2 = 0; int score1 = 0, score2 = 0; for (i = 0; i < 5; i++) { int c = 1<= opts->het_fract * score1 && opts->ambig) { used_base |= call2; used_score += score2; - used_depth += depth2; } // N is too shallow, or insufficient proportion of total - if (used_depth < opts->min_depth || + if (tot_depth < opts->min_depth || used_score < opts->call_fract * tscore) { - used_depth = 0; // But note shallow gaps are still called gaps, not N, as // we're still more confident there is no base than it is // A, C, G or T. - used_base = call1 == 16 /*&& depth1 >= call_fract * depth*/ - ? 16 : 0; // * or N + used_base = call1 == 16 ? 16 : 0; // * or N } // Our final call. "?" shouldn't be possible to generate @@ -1104,7 +2003,7 @@ static int calculate_consensus_simple(const pileup_t *plp, "NACMGRSVTWYHKDBN" "*ac?g???t???????"; - //printf("%c %d\n", het[used_base], used_depth); + //printf("%c %d\n", het[used_base], tot_depth); if (qual) *qual = used_base ? 100.0 * used_score / tscore : 0; @@ -1171,10 +2070,11 @@ static int basic_pileup(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } } - if (opts->gap5) { + if (opts->mode != MODE_SIMPLE) { consensus_t cons; - calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual); + calculate_consensus_gap5m(pos, opts->use_mqual ? 
CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual, + &cons_prob_recall, &cons_prob_precise); if (cons.het_logodd > 0 && opts->ambig) { cb = "AMRWa" // 5x5 matrix with ACGT* per row / col "MCSYc" @@ -1309,10 +2209,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, } // share this with basic_pileup - if (opts->gap5) { + if (opts->mode != MODE_SIMPLE) { consensus_t cons; - calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0, - depth, p, opts, &cons, opts->default_qual); + calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0, + depth, p, opts, &cons, opts->default_qual, + &cons_prob_recall, &cons_prob_precise); if (cons.het_logodd > 0 && opts->ambig) { cb = "AMRWa" // 5x5 matrix with ACGT* per row / col "MCSYc" @@ -1346,6 +2247,11 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, opts->last_tid = tid; return 0; } + if (opts->mark_ins && nth && cb != '*') { + kputc('_', seq); + kputc('_', qual); + } + // end of share // Append consensus base/qual to seqs @@ -1375,6 +2281,7 @@ static int basic_fasta(void *cd, samFile *fp, sam_hdr_t *h, pileup_t *p, return 0; } + // END OF NEW PILEUP //--------------------------------------------------------------------------- @@ -1393,14 +2300,16 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " Exclude reads with any flag bit set\n"); fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n"); fprintf(fp, " --min-MQ INT Exclude reads with mapping quality below INT [0]\n"); + fprintf(fp, " --min-BQ INT Exclude reads with base quality below INT [0]\n"); fprintf(fp, " --show-del yes/no Whether to show deletion as \"*\" [no]\n"); fprintf(fp, " --show-ins yes/no Whether to show insertions [yes]\n"); + fprintf(fp, " --mark-ins Add '+' before every inserted base/qual [off]\n"); fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n"); fprintf(fp, "\nFor simple consensus mode:\n"); fprintf(fp, " -q, --(no-)use-qual Use quality values in 
calculation [off]\n"); fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n"); - fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n"); - fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.5]\n"); + fprintf(fp, " -d, --min-depth INT Minimum depth of INT [2]\n"); + fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.15]\n"); fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n"); fprintf(fp, " -C, --cutoff C Consensus cutoff quality C [10]\n"); fprintf(fp, " --(no-)adj-qual Modify quality with local minima [on]\n"); @@ -1412,6 +2321,18 @@ static void usage_exit(FILE *fp, int exit_status) { fprintf(fp, " --high-MQ INT Cap maximum mapping quality [60]\n"); fprintf(fp, " --P-het FLOAT Probability of heterozygous site[%.1e]\n", P_HET); + fprintf(fp, " --P-indel FLOAT Probability of indel sites[%.1e]\n", + P_INDEL); + fprintf(fp, " --het-scale FLOAT Heterozygous SNP probability multiplier[%.1e]\n", + P_HET_SCALE); + fprintf(fp, " -p, --homopoly-fix Spread low-qual bases to both ends of homopolymers\n"); + fprintf(fp, " --homopoly-score FLOAT\n" + " Qual fraction adjustment for -p option [%g]\n", P_HOMOPOLY); + fprintf(fp, " -t, --qual-calibration FILE / :config (see man page)\n"); + fprintf(fp, " Load quality calibration file\n"); + fprintf(fp, "\n"); + fprintf(fp, " -X, --config STR Use pre-defined configuration set. 
STR from:\n"); + fprintf(fp, " hiseq, hifi, r10.4_sup, r10.4_dup and ultima\n"); fprintf(fp, "\nGlobal options:\n"); sam_global_opt_help(fp, "-.---@-."); @@ -1423,7 +2344,7 @@ int main_consensus(int argc, char **argv) { consensus_opts opts = { // User options - .gap5 = 1, + .mode = MODE_RECALL, .use_qual = 0, .min_qual = 0, .adj_qual = 1, @@ -1446,10 +2367,15 @@ int main_consensus(int argc, char **argv) { .all_bases = 0, .show_del = 0, .show_ins = 1, + .mark_ins = 0, .incl_flags = 0, .excl_flags = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, .min_mqual = 0, .P_het = P_HET, + .P_indel = P_INDEL, + .het_scale = P_HET_SCALE, + .homopoly_fix = 0, + .homopoly_redux = 0.01, // Internal state .ks_line = {0,0}, @@ -1463,6 +2389,8 @@ int main_consensus(int argc, char **argv) { .last_pos = -1, }; + set_qcal(&opts.qcal, QCAL_FLAT); + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'), @@ -1491,18 +2419,27 @@ int main_consensus(int argc, char **argv) { {"het-only", no_argument, NULL, 6}, {"show-del", required_argument, NULL, 7}, {"show-ins", required_argument, NULL, 8}, + {"mark-ins", no_argument, NULL, 18}, {"output", required_argument, NULL, 'o'}, {"incl-flags", required_argument, NULL, 11}, {"rf", required_argument, NULL, 11}, {"excl-flags", required_argument, NULL, 12}, {"ff", required_argument, NULL, 12}, {"min-MQ", required_argument, NULL, 13}, + {"min-BQ", required_argument, NULL, 16}, {"P-het", required_argument, NULL, 15}, + {"P-indel", required_argument, NULL, 17}, + {"het-scale", required_argument, NULL, 19}, {"mode", required_argument, NULL, 'm'}, + {"homopoly-fix", no_argument, NULL, 'p'}, + {"homopoly-score", required_argument, NULL, 'p'+100}, + {"homopoly-redux", required_argument, NULL, 'p'+200}, + {"qual-calibration", required_argument, NULL, 't'}, + {"config", required_argument, NULL, 'X'}, {NULL, 0, NULL, 0} }; - while ((c = getopt_long(argc, argv, 
"@:qd:c:H:r:5f:C:aAl:o:m:", + while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:", lopts, NULL)) >= 0) { switch (c) { case 'a': opts.all_bases++; break; @@ -1519,12 +2456,21 @@ int main_consensus(int argc, char **argv) { case 'r': opts.reg = optarg; break; case 'C': opts.cons_cutoff = atoi(optarg); break; case 'A': opts.ambig = 1; break; + case 'p': opts.homopoly_fix = P_HOMOPOLY; break; + case 'p'+100: opts.homopoly_fix = atof(optarg); break; + case 'p'+200: + // EXPERIMENTAL + opts.homopoly_redux = atof(optarg); break; case 1: opts.default_qual = atoi(optarg); break; case 6: opts.het_only = 1; break; case 7: opts.show_del = (*optarg == 'y' || *optarg == 'Y'); break; case 8: opts.show_ins = (*optarg == 'y' || *optarg == 'Y'); break; + case 18: opts.mark_ins = 1; break; case 13: opts.min_mqual = atoi(optarg); break; + case 16: opts.min_qual = atoi(optarg); break; case 15: opts.P_het = atof(optarg); break; + case 17: opts.P_indel = atof(optarg); break; + case 19: opts.het_scale = atof(optarg); break; case 'q'+100: opts.adj_qual = 1; break; case 'q'+101: opts.adj_qual = 0; break; case 'm'+100: opts.nm_adjust = 1; break; @@ -1534,9 +2480,22 @@ int main_consensus(int argc, char **argv) { case 'm': // mode if (strcasecmp(optarg, "simple") == 0) { - opts.gap5 = 0; - } else if (strcasecmp(optarg, "bayesian") == 0) { - opts.gap5 = 1; + opts.mode = MODE_SIMPLE; + } else if (strcasecmp(optarg, "bayesian_m") == 0) { + // EXPERIMENTAL: + // A mixture of modified precise/recall params and a + // blending of the two. Sometimes helps a bit. 
+ opts.mode = MODE_MIXED; + } else if (strcasecmp(optarg, "bayesian_p") == 0) { + // EXPERIMENTAL: + // favours precision + opts.mode = MODE_PRECISE; + } else if (strcasecmp(optarg, "bayesian_r") == 0 || + strcasecmp(optarg, "bayesian") == 0) { + // favours recall; the default + opts.mode = MODE_RECALL; + } else if (strcasecmp(optarg, "bayesian_116") == 0) { + opts.mode = MODE_BAYES_116; } else { fprintf(samtools_stderr, "Unknown mode %s\n", optarg); return 1; @@ -1568,6 +2527,67 @@ int main_consensus(int argc, char **argv) { } break; + case 'X': + if (strcasecmp(optarg, "hifi") == 0) { + set_qcal(&opts.qcal, QCAL_HIFI); + opts.mode = MODE_RECALL; + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.low_mqual = 5; + opts.scale_mqual = 1.5; + opts.het_scale = 0.37; + } else if (strcasecmp(optarg, "hiseq") == 0) { + opts.mode = MODE_RECALL; + set_qcal(&opts.qcal, QCAL_HISEQ); + opts.homopoly_redux = 0.01; + } else if (strcasecmp(optarg, "r10.4_sup") == 0) { + // Same as HiFi params, but ONT calibration table. + // At higher depth, hifi params work well for ONT + // when combined with ONT calibration chart. + // + // At lower depth we gain a bit from increasing homopoly_redux + set_qcal(&opts.qcal, QCAL_ONT_R10_4_SUP); + opts.mode = MODE_RECALL; + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.low_mqual = 5; + opts.scale_mqual = 1.5; + opts.het_scale = 0.37; + + // Also consider, for lower depth: + // opts.homopoly_redux = 1; + // opts.scale_mqual = 1; + // opts.het_scale = 0.45; + } else if (strcasecmp(optarg, "r10.4_dup") == 0) { + // Just a copy of of HiFi for duplex currently until + // we get a good truth set for calibration. 
+ set_qcal(&opts.qcal, QCAL_ONT_R10_4_DUP); + opts.mode = MODE_RECALL; + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.low_mqual = 5; + opts.scale_mqual = 1.5; + opts.het_scale = 0.37; + } else if (strcasecmp(optarg, "ultima") == 0) { + // Very similar to HiFi, but with own calibration table + opts.mode = MODE_RECALL; + set_qcal(&opts.qcal, QCAL_ULTIMA); + opts.homopoly_fix = 0.3; + opts.homopoly_redux = 0.01; + opts.het_scale = 0.37; + opts.scale_mqual = 2; + opts.low_mqual = 10; + } else { + // NB consider defaults that are a mixture of all above. + // Options are all similar for all bar Illumina. + // Unsure what :flat calibration table does to each of + // these though. + fprintf(samtools_stderr, "Unrecognised configuration name: \"%s\"\n", + optarg); + return 1; + } + break; + case 11: if ((opts.incl_flags = bam_str2flag(optarg)) < 0) { print_error("consensus", "could not parse --rf %s", optarg); @@ -1581,6 +2601,15 @@ int main_consensus(int argc, char **argv) { } break; + case 't': // --qual-calibration + if (load_qcal(&opts.qcal, optarg) < 0) { + print_error("consensus", + "failed to load quality calibration '%s'", + optarg); + return -1; + } + break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': @@ -1588,6 +2617,44 @@ int main_consensus(int argc, char **argv) { } } +#if 0 + // Dump out the qcal table. Useful for copying into the code above. + int i; + qcal_t *q = &opts.qcal; + fprintf(samtools_stderr, "{"); + for (i = 0; i < 100; i++) + fprintf(samtools_stderr, "%2d,%s", q->smap[i],(i+1)%10?" ":"\n"); + fprintf(samtools_stderr, "},\n{"); + for (i = 0; i < 100; i++) + fprintf(samtools_stderr, "%2d,%s", q->umap[i],(i+1)%10?" ":"\n"); + fprintf(samtools_stderr, "},\n{"); + for (i = 0; i < 100; i++) + fprintf(samtools_stderr, "%2d,%s", q->omap[i],(i+1)%10?" 
":"\n"); + fprintf(samtools_stderr, "}\n"); +#endif + + if (opts.mode != MODE_SIMPLE) { + if (opts.mode == MODE_PRECISE) + // More accuracy / precision, but a significant drop + // in recall. + consensus_init(opts.P_het, opts.P_indel, + 0.3 * opts.het_scale, opts.homopoly_redux, + &opts.qcal, MODE_PRECISE, &cons_prob_precise); + + if (opts.mode == MODE_MIXED) + // Blend these in when running in mixed mode, so we can + // keep sensitivity but have a better joint quality to + // reduce the FP rate. + consensus_init(pow(opts.P_het, 0.7), pow(opts.P_indel, 0.7), + 0.3 * opts.het_scale, opts.homopoly_redux, + &opts.qcal, MODE_PRECISE, &cons_prob_precise); + + // Better recall, at a cost of some accuracy (false positives) + consensus_init(opts.P_het, opts.P_indel, opts.het_scale, + opts.mode == MODE_RECALL ? opts.homopoly_redux : 0.01, + &opts.qcal, MODE_RECALL, &cons_prob_recall); + } + if (argc != optind+1) { if (argc == optind) usage_exit(samtools_stdout, EXIT_SUCCESS); else usage_exit(samtools_stderr, EXIT_FAILURE); @@ -1627,8 +2694,11 @@ int main_consensus(int argc, char **argv) { } if (opts.fmt == PILEUP) { - if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, - basic_pileup, &opts) < 0) + if (pileup_loop(opts.fp, opts.h, readaln2, + opts.mode != MODE_SIMPLE ? nm_init : NULL, + basic_pileup, + opts.mode != MODE_SIMPLE ? nm_free : NULL, + &opts) < 0) goto err; if (opts.all_bases) { @@ -1643,8 +2713,10 @@ int main_consensus(int argc, char **argv) { goto err; } } else { - if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL, + if (pileup_loop(opts.fp, opts.h, readaln2, + opts.mode != MODE_SIMPLE ? nm_init : NULL, basic_fasta, + opts.mode != MODE_SIMPLE ? 
nm_free : NULL, &opts) < 0) goto err; if (opts.all_bases) { diff --git a/samtools/bam_import.c b/samtools/bam_import.c index 47cb125..14ff0b0 100644 --- a/samtools/bam_import.c +++ b/samtools/bam_import.c @@ -4,7 +4,7 @@ * samtools import a_1.fq a_2.fq * samtools import a_interleaved.fq * - * Copyright (C) 2020 Genome Research Ltd. + * Copyright (C) 2020-2021 Genome Research Ltd. * * Author: James Bonfield */ diff --git a/samtools/bam_import.c.pysam.c b/samtools/bam_import.c.pysam.c index 76b61a4..842ff60 100644 --- a/samtools/bam_import.c.pysam.c +++ b/samtools/bam_import.c.pysam.c @@ -6,7 +6,7 @@ * samtools import a_1.fq a_2.fq * samtools import a_interleaved.fq * - * Copyright (C) 2020 Genome Research Ltd. + * Copyright (C) 2020-2021 Genome Research Ltd. * * Author: James Bonfield */ diff --git a/samtools/bam_markdup.c b/samtools/bam_markdup.c index be9b195..fc333c4 100644 --- a/samtools/bam_markdup.c +++ b/samtools/bam_markdup.c @@ -1,7 +1,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2022 Genome Research Ltd. + Copyright (C) 2017-2023 Genome Research Ltd. 
Author: Andrew Whitwham @@ -74,6 +74,8 @@ typedef struct { int rgx_t; char *barcode; regex_t *bc_rgx; + int read_groups; + int json; } md_param_t; typedef struct { @@ -82,6 +84,7 @@ typedef struct { int32_t this_ref; int32_t other_ref; int32_t barcode; + int32_t read_group; int8_t single; int8_t leftmost; int8_t orientation; @@ -92,8 +95,10 @@ typedef struct read_queue_s { key_data_t single_key; bam1_t *b; struct read_queue_s *duplicate; + struct read_queue_s *original; hts_pos_t pos; int dup_checked; + int read_group; } read_queue_t; typedef struct { @@ -103,6 +108,7 @@ typedef struct { typedef struct { char *name; char type; + int read_group; } dup_map_t; typedef struct { @@ -116,13 +122,27 @@ typedef struct { int end; } check_t; - typedef struct { check_t *c; size_t size; size_t length; } check_list_t; +typedef struct { + long reading; + long writing; + long excluded; + long duplicate; + long single; + long pair; + long single_dup; + long examined; + long optical; + long single_optical; + long np_duplicate; + long np_opt_duplicate; +} stats_block_t; + static khint32_t do_hash(unsigned char *key, khint32_t len); static khint_t hash_key(key_data_t key) { @@ -130,16 +150,17 @@ static khint_t hash_key(key_data_t key) { khint_t hash; if (key.single) { - unsigned char sig[17]; + unsigned char sig[21]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; memcpy(sig + i, &key.orientation, 1); i += 1; memcpy(sig + i, &key.barcode, 4); i += 4; + memcpy(sig + i, &key.read_group, 4); i += 4; hash = do_hash(sig, i); } else { - unsigned char sig[30]; + unsigned char sig[34]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; @@ -148,6 +169,7 @@ static khint_t hash_key(key_data_t key) { memcpy(sig + i, &key.leftmost, 1); i += 1; memcpy(sig + i, &key.orientation, 1); i += 1; memcpy(sig + i, &key.barcode, 4); i += 4; + memcpy(sig + i, &key.read_group, 4); i += 4; hash = do_hash(sig, i); } @@ -169,6 +191,8 @@ 
static int key_equal(key_data_t a, key_data_t b) { match = 0; else if (a.barcode != b.barcode) match = 0; + else if (a.read_group != b.read_group) + match = 0; if (!a.single) { if (a.other_coord != b.other_coord) @@ -206,6 +230,7 @@ static int key_equal(key_data_t a, key_data_t b) { KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id +KHASH_MAP_INIT_STR(read_groups, int) // read group lookup /* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */ @@ -235,7 +260,7 @@ static int64_t get_mate_score(bam1_t *b) { if ((data = bam_aux_get(b, "ms"))) { score = bam_aux2i(data); } else { - fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n"); return -1; } @@ -265,7 +290,7 @@ static int64_t calc_score(bam1_t *b) read is leftmost of the pair. */ -static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { +static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) { hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; int32_t this_ref, other_ref, barcode = 0; int8_t orientation, left_read; @@ -281,14 +306,14 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * if ((data = bam_aux_get(bam, "MC"))) { if (!(cig = bam_aux2Z(data))) { - fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); + print_error("markdup", "error, MC tag wrong type. 
Please use the MC tag provided by samtools fixmate.\n"); return 1; } other_end = unclipped_other_end(bam->core.mpos, cig); other_coord = unclipped_other_start(bam->core.mpos, cig); } else { - fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no MC tag. Please run samtools fixmate on file first.\n"); return 1; } @@ -464,7 +489,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); + print_error("markdup", "warning, %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); } } else { barcode = do_hash((unsigned char *)bar, strlen(bar)); @@ -488,7 +513,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname); } } } else { @@ -498,13 +523,13 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * char warn_msg[256]; regerror(result, param->bc_rgx, warn_msg, 256); - fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname); } } } if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { - fprintf(stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n", + print_error("markdup", "warning, %ld barcode read warnings. 
New warnings will not be reported.\n", *warnings); } @@ -516,6 +541,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * key->leftmost = left_read; key->orientation = orientation; key->barcode = barcode; + key->read_group = rg_num; return 0; } @@ -525,7 +551,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * Uses unclipped start (or end depending on orientation), reference id, and orientation. */ -static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { +static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) { hts_pos_t this_coord; int32_t this_ref, barcode = 0; int8_t orientation; @@ -549,7 +575,7 @@ static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, lon (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); + print_error("markdup", "warning, %s tag wrong type. 
Aux tag needs to be a string type.\n", param->barcode); } } else { barcode = do_hash((unsigned char *)bar, strlen(bar)); @@ -573,7 +599,7 @@ static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, lon (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname); } } } else { @@ -583,27 +609,29 @@ static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, lon char warn_msg[256]; regerror(result, param->bc_rgx, warn_msg, 256); - fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname); } } } if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { - fprintf(stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n", + print_error("markdup", "warning, %ld barcode read warnings. New warnings will not be reported.\n", *warnings); } + key->single = 1; key->this_ref = this_ref; key->this_coord = this_coord; key->orientation = orientation; key->barcode = barcode; + key->read_group = rg_num; } /* Add the duplicate name to a hash if it does not exist. 
*/ -static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) { +static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type, int group) { khiter_t d; int ret; @@ -628,7 +656,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n kh_value(d_hash, d).name = strdup(orig_name); if (kh_value(d_hash, d).name == NULL) { - fprintf(stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n"); + print_error("markdup", "error, unable to allocate memory for duplicate original name.\n"); return 1; } } else { @@ -636,8 +664,9 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n } kh_value(d_hash, d).type = type; + kh_value(d_hash, d).read_group = group; } else { - fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n"); + print_error("markdup", "error, unable to store supplementary duplicates.\n"); free(name); return 1; } @@ -690,7 +719,7 @@ static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_b (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); + print_error("markdup", "warning, cannot decipher read name %s for optical duplicate marking.\n", qname); } return 1; @@ -701,7 +730,7 @@ static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_b (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname); + print_error("markdup", "warning, cannot decipher x coordinate in %s .\n", qname); } return 1; @@ -713,7 +742,7 @@ static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_b (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", qname); + print_error("markdup", "warning, 
cannot decipher y coordinate in %s .\n", qname); } return 1; @@ -763,7 +792,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen); + print_error("markdup", "warning, x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen); } return 1; @@ -777,7 +806,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord); + print_error("markdup", "warning, cannot decipher x coordinate in %s (%s).\n", qname, coord); } return 1; @@ -787,7 +816,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen); + print_error("markdup", "warning, y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen); } return 1; @@ -801,7 +830,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord); + print_error("markdup", "warning, cannot decipher y coordinate in %s (%s).\n", qname, coord); } return 1; @@ -921,7 +950,7 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const /* Mark the read as a duplicate and update the duplicate hash (if needed) */ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, - long *optical, long *warn) { + int read_group, long *optical, long *warn) { char dup_type = 0; long 
incoming_warnings = *warn; @@ -929,7 +958,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam if (param->tag) { if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) { - fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n"); + print_error("markdup", "error, unable to append 'do' tag.\n"); return -1; } } @@ -946,7 +975,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) { - fprintf(stderr, "[markdup] warning: %ld decipher read name warnings. New warnings will not be reported.\n", + print_error("markdup", "warning, %ld decipher read name warnings. New warnings will not be reported.\n", *warn); } @@ -958,7 +987,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam original = bam_get_qname(ori); } - if (add_duplicate(dup_hash, dup, original, dup_type)) + if (add_duplicate(dup_hash, dup, original, dup_type, read_group)) return -1; } } @@ -968,18 +997,18 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam /* If the duplicate type has changed to optical then retag and duplicate hash. 
*/ -static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { +static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, stats_block_t *stats) { int ret = 0; if (bam_aux_update_str(b, "dt", 3, "SQ")) { - fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n"); + print_error("markdup", "error, unable to update 'dt' tag.\n"); ret = -1; } if (paired) { - (*optical_pair)++; + stats->optical++; } else { - (*optical_single)++; + stats->single_optical++; } if (param->supp) { @@ -993,7 +1022,7 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash if (d == kh_end(dup_hash)) { // error, name should already be in dup hash - fprintf(stderr, "[markdup] error: duplicate name %s not found in hash.\n", + print_error("markdup", "error, duplicate name %s not found in hash.\n", bam_get_qname(b)); ret = -1; } else { @@ -1011,7 +1040,7 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash Returns 0 on success, >0 on coordinate reading error (program can continue) or <0 on an error (program should not continue. 
*/ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, - check_list_t *list, long *warn, long *optical_single, long *optical_pair) { + check_list_t *list, long *warn, stats_block_t *stats) { int ret = 0, coord_fail = 0; char *ori_name = bam_get_qname(ori->b); @@ -1034,7 +1063,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * list->size *= 2; if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) { - fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n"); + print_error("markdup", "error, Unable to expand optical check list.\n"); return -1; } @@ -1062,13 +1091,13 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * if (old_name) { if (strcmp(old_name, ori_name) != 0) { if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) { - fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n"); + print_error("markdup", "error, unable to update 'do' tag.\n"); ret = -1; break; } } } else { - fprintf(stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); + print_error("markdup", "error, 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); ret = -1; break; } @@ -1093,7 +1122,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * is_opt = optical_duplicate_partial(param, ori_name, t_beg, t_end, x, y, current->b, c, param->opt_dist, warn); if (!c->opt && is_opt) { - if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + if (optical_retag(param, dup_hash, current->b, current_paired, stats)) { ret = -1; break; } @@ -1105,7 +1134,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * if (current_paired) { if ((c->mate_score = get_mate_score(current->b)) == -1) { - fprintf(stderr, "[markdup] error: no ms score tag. 
Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n"); ret = -1; break; } @@ -1119,6 +1148,8 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * if (!ret && coord_fail) ret = coord_fail; + ori->dup_checked = 1; + return ret; } @@ -1133,7 +1164,7 @@ static int xcoord_sort(const void *a, const void *b) { /* Check all the duplicates against each other to see if they are optical duplicates. */ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list, - long *warn, long *optical_single, long *optical_pair) { + long *warn, stats_block_t *stats) { int ret = 0; size_t curr = 0; @@ -1217,7 +1248,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has if (chk_dup) { // the duplicate is the optical duplicate if (!chk->opt) { // only change if not already an optical duplicate - if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) { + if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) { ret = -1; goto fail; } @@ -1226,7 +1257,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has } } else { if (!current->opt) { - if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + if (optical_retag(param, dup_hash, current->b, current_paired, stats)) { ret = -1; goto fail; } @@ -1246,53 +1277,24 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has /* Where there is more than one duplicate go down the list and check for optical duplicates and change do tags (where used) to point to original (non-duplicate) read. 
*/ -static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list, - const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single, - long *optical_pair, const int check_range) { +static int find_duplicate_chains(md_param_t *param, read_queue_t *in_read , khash_t(duplicates) *dup_hash, check_list_t *dup_list, + long *warn, stats_block_t *stats) { int ret = 0; - kliter_t(read_queue) *rq; - - rq = kl_begin(read_buffer); - while (rq != kl_end(read_buffer)) { - read_queue_t *in_read = &kl_val(rq); + while (in_read->original) in_read = in_read->original; - if (check_range) { - /* Just check against the moving window of reads based on coordinates and max read length. */ - if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { - break; - } - } else { - // this is the last set of results and the end entry will be blank - if (!bam_get_qname(in_read->b)) { - break; - } + // check against the original for tagging and optical duplication + if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, stats + in_read->read_group))) { + if (ret < 0) { // real error + ret = -1; + } else { // coordinate decoding error + ret = 0; } - - if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain - - // check against the original for tagging and optical duplication - if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) { - if (ret < 0) { // real error - ret = -1; - break; - } else { // coordinate decoding error - ret = 0; - in_read->duplicate = NULL; - continue; - } - } - - // check the rest of the duplicates against each other for optical duplication - if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) { - ret = -1; - break; - } - - 
in_read->duplicate = NULL; + } else { + // check the rest of the duplicates against each other for optical duplication + if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, stats + in_read->read_group)) { + ret = -1; } - - rq = kl_next(rq); } return ret; @@ -1339,7 +1341,7 @@ static unsigned long estimate_library_size(unsigned long paired_reads, unsigned int i; if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) { - fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n"); + print_error("markdup", "warning, unable to calculate estimated library size.\n"); return estimated_size; } @@ -1362,7 +1364,7 @@ static unsigned long estimate_library_size(unsigned long paired_reads, unsigned estimated_size = (unsigned long)(unique_pairs * (m + M) / 2); } else { - fprintf(stderr, "[markdup] warning: unable to calculate estimated library size." + print_error("markdup", "warning, unable to calculate estimated library size." 
" Read pairs %ld should be greater than duplicate pairs %ld," " which should both be non zero.\n", non_optical_pairs, duplicate_pairs); @@ -1372,6 +1374,67 @@ static unsigned long estimate_library_size(unsigned long paired_reads, unsigned } +static void write_stats(FILE *fp, const char *title, const char *title_con, stats_block_t *stats) { + unsigned long els; + + els = estimate_library_size(stats->pair, stats->duplicate, stats->optical); + + if (title) { + fprintf(fp, "%s%s\n", title, title_con); + } + + fprintf(fp, + "READ: %ld\n" + "WRITTEN: %ld\n" + "EXCLUDED: %ld\n" + "EXAMINED: %ld\n" + "PAIRED: %ld\n" + "SINGLE: %ld\n" + "DUPLICATE PAIR: %ld\n" + "DUPLICATE SINGLE: %ld\n" + "DUPLICATE PAIR OPTICAL: %ld\n" + "DUPLICATE SINGLE OPTICAL: %ld\n" + "DUPLICATE NON PRIMARY: %ld\n" + "DUPLICATE NON PRIMARY OPTICAL: %ld\n" + "DUPLICATE PRIMARY TOTAL: %ld\n" + "DUPLICATE TOTAL: %ld\n" + "ESTIMATED_LIBRARY_SIZE: %ld\n", stats->reading, stats->writing, stats->excluded, stats->examined, stats->pair, stats->single, + stats->duplicate, stats->single_dup, stats->optical, stats->single_optical, stats->np_duplicate, stats->np_opt_duplicate, + stats->single_dup + stats->duplicate, stats->single_dup + stats->duplicate + stats->np_duplicate, els); +} + + +static void write_json_stats(FILE *fp, const char *offset, const char *group_name, stats_block_t *stats, const char *end) { + unsigned long els; + + els = estimate_library_size(stats->pair, stats->duplicate, stats->optical); + + if (group_name) { + fprintf(fp, "%s\"READ GROUP\": \"%s\",\n", offset, group_name); + } + + fprintf(fp, "%s\"READ\": %ld,\n", offset, stats->reading); + fprintf(fp, "%s\"WRITTEN\": %ld,\n", offset, stats->writing); + fprintf(fp, "%s\"EXCLUDED\": %ld,\n", offset, stats->excluded); + fprintf(fp, "%s\"EXAMINED\": %ld,\n", offset, stats->examined); + fprintf(fp, "%s\"PAIRED\": %ld,\n", offset, stats->pair); + fprintf(fp, "%s\"SINGLE\": %ld,\n", offset, stats->single); + fprintf(fp, "%s\"DUPLICATE PAIR\": 
%ld,\n", offset, stats->duplicate); + fprintf(fp, "%s\"DUPLICATE SINGLE\": %ld,\n", offset, stats->single_dup); + fprintf(fp, "%s\"DUPLICATE PAIR OPTICAL\": %ld,\n", offset, stats->optical); + fprintf(fp, "%s\"DUPLICATE SINGLE OPTICAL\": %ld,\n", offset, stats->single_optical); + fprintf(fp, "%s\"DUPLICATE NON PRIMARY\": %ld,\n", offset, stats->np_duplicate); + fprintf(fp, "%s\"DUPLICATE NON PRIMARY OPTICAL\": %ld,\n", offset, stats->np_opt_duplicate); + fprintf(fp, "%s\"DUPLICATE PRIMARY TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate); + fprintf(fp, "%s\"DUPLICATE TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate + stats->np_duplicate); + fprintf(fp, "%s\"ESTIMATED_LIBRARY_SIZE\": %ld", offset, els); + + if (end) { + fprintf(fp, "%s", end); + } +} + + /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. Generally the highest quality scoring is chosen as the original and all others the duplicates. The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). 
@@ -1389,25 +1452,26 @@ static int bam_mark_duplicates(md_param_t *param) { klist_t(read_queue) *read_buffer = kl_init(read_queue); kliter_t(read_queue) *rq; khash_t(duplicates) *dup_hash = kh_init(duplicates); + khash_t(read_groups) *rg_hash = kh_init(read_groups); int32_t prev_tid; hts_pos_t prev_coord; read_queue_t *in_read; int ret; - long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; - long np_duplicate, np_opt_duplicate; + stats_block_t *stats, *stat_array = NULL; + int num_groups = 0; long opt_warnings = 0, bc_warnings = 0; tmp_file_t temp; char *idx_fn = NULL; int exclude = 0; check_list_t dup_list = {NULL, 0, 0}; - if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { - fprintf(stderr, "[markdup] out of memory\n"); + if (!pair_hash || !single_hash || !read_buffer || !dup_hash || !rg_hash) { + print_error("markdup", "error, unable to allocate memory to initialise structures.\n"); goto fail; } if ((header = sam_hdr_read(param->in)) == NULL) { - fprintf(stderr, "[markdup] error reading header\n"); + print_error("markdup", "error reading header\n"); goto fail; } @@ -1415,7 +1479,7 @@ static int bam_mark_duplicates(md_param_t *param) { // only really works on coordinate sorted files. kstring_t str = KS_INITIALIZE; if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) { - fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); + print_error("markdup", "error, queryname sorted, must be sorted by coordinate.\n"); ks_free(&str); goto fail; } @@ -1425,11 +1489,11 @@ static int bam_mark_duplicates(md_param_t *param) { param->arg_list ? "CL" : NULL, param->arg_list ? 
param->arg_list : NULL, NULL) != 0) { - fprintf(stderr, "[markdup] warning: unable to add @PG line to header.\n"); + print_error("markdup", "warning, unable to add @PG line to header.\n"); } if (sam_hdr_write(param->out, header) < 0) { - fprintf(stderr, "[markdup] error writing header.\n"); + print_error("markdup", "error writing header.\n"); goto fail; } if (param->write_index) { @@ -1437,26 +1501,86 @@ static int bam_mark_duplicates(md_param_t *param) { goto fail; } + if (param->read_groups) { + num_groups = sam_hdr_count_lines(header, "RG"); + int g_ret = 0; + + if (num_groups > 0) { + int i; + + for (i = 0; i < num_groups; i++) { + const char *rg_key; + khiter_t rg; + + rg_key = sam_hdr_line_name(header, "RG", i); + + if (rg_key) { + rg = kh_get(read_groups, rg_hash, rg_key); + + if (rg == kh_end(rg_hash)) { // new entry + rg = kh_put(read_groups, rg_hash, rg_key, &g_ret); + + if (g_ret > 0) { + kh_value(rg_hash, rg) = i + 1; + } else { + print_error("markdup", "error, unable to populate read group ids. " + "Read groups will not be used\n"); + g_ret = -1; + break; + } + } else { + print_error("markdup", "error, duplicate read group ids %s." + "Read groups will not be used\n", rg_key); + g_ret = -1; + break; + } + } else { + print_error("markdup", "error, Unable to retrieve read group at position %d." 
+ "Read groups will not be used\n", i); + g_ret = -1; + break; + } + } + } else { + print_error("markdup", "error, no read groups found.\n"); + g_ret = -1; + } + + if (g_ret < 0) { + print_error("markdup", "error, read groups will not be used.\n"); + param->read_groups = 0; + num_groups = 0; + } + } + + // stat_array[0] will be for ungrouped reads + stat_array = calloc(num_groups + 1, sizeof(stats_block_t)); + + if (stat_array == NULL) { + print_error("markdup", "error, unable to allocate memory for stats.\n"); + goto fail; + } + // used for coordinate order checks prev_tid = prev_coord = 0; // get the buffer going in_read = kl_pushp(read_queue, read_buffer); if (!in_read) { - fprintf(stderr, "[markdup] out of memory\n"); + print_error("markdup", "error, unable to allocate memory to hold reads.\n"); goto fail; } // handling supplementary reads needs a temporary file if (param->supp) { if (tmp_file_open_write(&temp, param->prefix, 1)) { - fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix); + print_error("markdup", "error, unable to open tmp file %s.\n", param->prefix); goto fail; } } if ((in_read->b = bam_init1()) == NULL) { - fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + print_error("markdup", "error, unable to allocate memory for alignment.\n"); goto fail; } @@ -1468,22 +1592,18 @@ static int bam_mark_duplicates(md_param_t *param) { dup_list.c = NULL; if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) { - fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n"); + print_error("markdup", "error, unable to allocate memory for dup_list.\n"); goto fail; } } - reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0; - np_duplicate = np_opt_duplicate = 0; - while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { - int dup_checked = 0; // do some basic coordinate order checks if (in_read->b->core.tid >= 0) 
{ // -1 for unmapped reads if (in_read->b->core.tid < prev_tid || ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { - fprintf(stderr, "[markdup] error: not in coordinate sorted order.\n"); + print_error("markdup", "error, not in coordinate sorted order.\n"); goto fail; } } @@ -1493,9 +1613,30 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->pair_key.single = 1; in_read->single_key.single = 0; in_read->duplicate = NULL; + in_read->original = NULL; in_read->dup_checked = 0; + in_read->read_group = 0; - reading++; + if (param->read_groups) { + uint8_t *data; + char *rg; + + if ((data = bam_aux_get(in_read->b, "RG"))) { + if ((rg = bam_aux2Z(data))) { + khiter_t r; + + r = kh_get(read_groups, rg_hash, rg); + + if (r != kh_end(rg_hash)) { + in_read->read_group = kh_value(rg_hash, r); + } + } + } + } + + stats = stat_array + in_read->read_group; + + stats->reading++; if (param->clear && (in_read->b->core.flag & BAM_FDUP)) { uint8_t *data; @@ -1519,7 +1660,7 @@ static int bam_mark_duplicates(md_param_t *param) { // read must not be secondary, supplementary, unmapped or (possibly) failed QC if (!(in_read->b->core.flag & exclude)) { - examined++; + stats->examined++; // look at the pairs first @@ -1529,14 +1670,14 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - if (make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) { - fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n"); + if (make_pair_key(param, &pair_key, in_read->b, in_read->read_group, &bc_warnings)) { + print_error("markdup", "error, unable to assign pair hash key.\n"); goto fail; } - make_single_key(param, &single_key, in_read->b, &bc_warnings); + make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings); - pair++; + stats->pair++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos // put in singles hash for checking against non paired reads @@ -1556,18 
+1697,20 @@ static int bam_mark_duplicates(md_param_t *param) { // scores more than one read of the pair bam1_t *dup = bp->p->b; - if (param->check_chain) + if (param->check_chain) { in_read->duplicate = bp->p; + bp->p->original = in_read; + } bp->p = in_read; - if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; - single_dup++; + stats->single_dup++; } } else { - fprintf(stderr, "[markdup] error: single hashing failure.\n"); + print_error("markdup", "error, single hashing failure for paired read.\n"); goto fail; } @@ -1595,14 +1738,14 @@ static int bam_mark_duplicates(md_param_t *param) { } } else { if ((mate_tmp = get_mate_score(bp->p->b)) == -1) { - fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n"); goto fail; } else { old_score = calc_score(bp->p->b) + mate_tmp; } if ((mate_tmp = get_mate_score(in_read->b)) == -1) { - fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. 
Please run samtools fixmate on file first.\n"); goto fail; } else { new_score = calc_score(in_read->b) + mate_tmp; @@ -1636,6 +1779,8 @@ static int bam_mark_duplicates(md_param_t *param) { } else { in_read->duplicate = bp->p; } + + bp->p->original = in_read; } bp->p = in_read; @@ -1656,17 +1801,18 @@ static int bam_mark_duplicates(md_param_t *param) { } bp->p->duplicate = in_read; + in_read->original = bp->p; } dup = in_read->b; } - if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings)) goto fail; - duplicate++; + stats->duplicate++; } else { - fprintf(stderr, "[markdup] error: pair hashing failure.\n"); + print_error("markdup", "error, pair hashing failure.\n"); goto fail; } } else { // do the single (or effectively single) reads @@ -1674,9 +1820,9 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - make_single_key(param, &single_key, in_read->b, &bc_warnings); + make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings); - single++; + stats->single++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos k = kh_put(reads, single_hash, single_key, &ret); @@ -1697,9 +1843,10 @@ static int bam_mark_duplicates(md_param_t *param) { } bp->p->duplicate = in_read; + in_read->original = bp->p; } - if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; } else { @@ -1714,8 +1861,10 @@ static int bam_mark_duplicates(md_param_t *param) { if (new_score > old_score) { // swap reads dup = bp->p->b; - if (param->check_chain) + if (param->check_chain) { in_read->duplicate = bp->p; + bp->p->original = in_read; + } bp->p = in_read; } else { @@ -1725,23 +1874,24 @@ static int 
bam_mark_duplicates(md_param_t *param) { } bp->p->duplicate = in_read; + in_read->original = bp->p; } dup = in_read->b; } - if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; } - single_dup++; + stats->single_dup++; } else { - fprintf(stderr, "[markdup] error: single hashing failure.\n"); + print_error("markdup", "error, single hashing failure for single read.\n"); goto fail; } } } else { - excluded++; + stats->excluded++; } // loop through the stored reads and write out those we @@ -1756,36 +1906,27 @@ static int bam_mark_duplicates(md_param_t *param) { break; } - if (!dup_checked && param->check_chain) { - // check for multiple optical duplicates of the same original read - - if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) { - fprintf(stderr, "[markdup] error: duplicate checking failed.\n"); + if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) { + if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) { + print_error("markdup", "error, duplicate checking failed.\n"); goto fail; } - - dup_checked = 1; - } - - - if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) { - break; } if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { - fprintf(stderr, "[markdup] error: writing temp output failed.\n"); + print_error("markdup", "error, writing temp output failed.\n"); goto fail; } } else { if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(stderr, "[markdup] error: writing output failed.\n"); + print_error("markdup", "error, writing output failed.\n"); goto fail; } } - writing++; + 
stat_array[in_read->read_group].writing++; } // remove from hash @@ -1807,49 +1948,48 @@ static int bam_mark_duplicates(md_param_t *param) { // set the next one up for reading in_read = kl_pushp(read_queue, read_buffer); if (!in_read) { - fprintf(stderr, "[markdup] out of memory\n"); + print_error("markdup", "error, unable to allocate memory for read in queue.\n"); goto fail; } if ((in_read->b = bam_init1()) == NULL) { - fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + print_error("markdup", "error, unable to allocate memory for alignment.\n"); goto fail; } } if (ret < -1) { - fprintf(stderr, "[markdup] error: truncated input file.\n"); + print_error("markdup", "error, truncated input file.\n"); goto fail; } - // one last check - if (param->tag || param->opt_dist) { - if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) { - fprintf(stderr, "[markdup] error: duplicate checking failed.\n"); - goto fail; - } - } - // write out the end of the list rq = kl_begin(read_buffer); while (rq != kl_end(read_buffer)) { in_read = &kl_val(rq); if (bam_get_qname(in_read->b)) { // last entry will be blank + if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) { + if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) { + print_error("markdup", "error, duplicate checking failed.\n"); + goto fail; + } + } + if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { - fprintf(stderr, "[markdup] error: writing temp output failed.\n"); + print_error("markdup", "error, writing temp output failed on final write.\n"); goto fail; } } else { if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(stderr, "[markdup] error: writing output failed.\n"); + print_error("markdup", "error, writing output failed on final write.\n"); goto 
fail; } } - writing++; + stat_array[in_read->read_group].writing++; } } @@ -1862,7 +2002,7 @@ static int bam_mark_duplicates(md_param_t *param) { bam1_t *b; if (tmp_file_end_write(&temp)) { - fprintf(stderr, "[markdup] error: unable to end tmp writing.\n"); + print_error("markdup", "error, unable to end tmp writing.\n"); goto fail; } @@ -1883,11 +2023,11 @@ static int bam_mark_duplicates(md_param_t *param) { if (k != kh_end(dup_hash)) { b->core.flag |= BAM_FDUP; - np_duplicate++; + stat_array[kh_val(dup_hash, k).read_group].np_duplicate++; if (param->tag && kh_val(dup_hash, k).name) { if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) { - fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); + print_error("markdup", "error, unable to append supplementary 'do' tag.\n"); goto fail; } } @@ -1895,7 +2035,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->opt_dist) { if (kh_val(dup_hash, k).type) { bam_aux_update_str(b, "dt", 3, "SQ"); - np_opt_duplicate++; + stat_array[kh_val(dup_hash, k).read_group].np_opt_duplicate++; } else { bam_aux_update_str(b, "dt", 3, "LB"); } @@ -1905,14 +2045,14 @@ static int bam_mark_duplicates(md_param_t *param) { if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { if (sam_write1(param->out, header, b) < 0) { - fprintf(stderr, "[markdup] error: writing final output failed.\n"); + print_error("markdup", "error, writing final output failed.\n"); goto fail; } } } if (ret == -1) { - fprintf(stderr, "[markdup] error: failed to read tmp file.\n"); + print_error("markdup", "error, failed to read tmp file.\n"); goto fail; } @@ -1929,22 +2069,23 @@ static int bam_mark_duplicates(md_param_t *param) { } if (opt_warnings) { - fprintf(stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n", + print_error("markdup", "warning, number of failed attempts to get coordinates from read names = %ld\n", opt_warnings); } 
if (bc_warnings) { - fprintf(stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings); + print_error("markdup", "warning, number of failed attempts to get barcodes = %ld\n", bc_warnings); } if (param->do_stats) { FILE *fp; int file_open = 0; - unsigned long els; + stats_block_t total; + int i; if (param->stats_file) { if (NULL == (fp = fopen(param->stats_file, "w"))) { - fprintf(stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file); + print_error("markdup", "warning, cannot write stats to %s.\n", param->stats_file); fp = stderr; } else { file_open = 1; @@ -1953,27 +2094,75 @@ static int bam_mark_duplicates(md_param_t *param) { fp = stderr; } - els = estimate_library_size(pair, duplicate, optical); - - fprintf(fp, - "COMMAND: %s\n" - "READ: %ld\n" - "WRITTEN: %ld\n" - "EXCLUDED: %ld\n" - "EXAMINED: %ld\n" - "PAIRED: %ld\n" - "SINGLE: %ld\n" - "DUPLICATE PAIR: %ld\n" - "DUPLICATE SINGLE: %ld\n" - "DUPLICATE PAIR OPTICAL: %ld\n" - "DUPLICATE SINGLE OPTICAL: %ld\n" - "DUPLICATE NON PRIMARY: %ld\n" - "DUPLICATE NON PRIMARY OPTICAL: %ld\n" - "DUPLICATE PRIMARY TOTAL: %ld\n" - "DUPLICATE TOTAL: %ld\n" - "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single, - duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate, - single_dup + duplicate, single_dup + duplicate + np_duplicate, els); + total = stat_array[0]; + + if (param->read_groups) { + for (i = 1; i <= num_groups; i++) { + total.reading += stat_array[i].reading; + total.writing += stat_array[i].writing; + total.excluded += stat_array[i].excluded; + total.duplicate += stat_array[i].duplicate; + total.single += stat_array[i].single; + total.pair += stat_array[i].pair; + total.single_dup += stat_array[i].single_dup; + total.examined += stat_array[i].examined; + total.optical += stat_array[i].optical; + total.single_optical += stat_array[i].single_optical; + total.np_duplicate += 
stat_array[i].np_duplicate; + total.np_opt_duplicate += stat_array[i].np_opt_duplicate; + } + } + + if (!param->json) { + write_stats(fp, "COMMAND: ", param->arg_list, &total); + fprintf(fp, "\n"); + + if (param->read_groups) { + if (stat_array[0].reading) { + write_stats(fp, "READ GROUP: ", "ungrouped", stat_array); + fprintf(fp, "\n"); + } + + for (i = 0; i < num_groups; i++) { + write_stats(fp, "READ GROUP: ", sam_hdr_line_name(header, "RG", i), stat_array + i + 1); + fprintf(fp, "\n"); + } + } + } else { + char space4[] = " "; + char space8[] = " "; + char space12[] = " "; + + fprintf(fp, "{\n"); + fprintf(fp, "%s\"COMMAND\": \"%s\",\n", space4, param->arg_list); + write_json_stats(fp, space4, NULL, &total, param->read_groups ? ",\n" : "\n"); + + if (param->read_groups) { + fprintf(fp, "%s\"READ GROUPS\": [\n", space4); + + if (stat_array[0].reading) { + fprintf(fp, "%s{\n", space8); + write_json_stats(fp, space12, "ungrouped", stat_array, "\n"); + fprintf(fp, "%s},\n", space8); + } + + for (i = 0; i < num_groups; i++) { + fprintf(fp, "%s{\n", space8); + + write_json_stats(fp, space12, sam_hdr_line_name(header, "RG", i), stat_array + i + 1, "\n"); + + if (i < num_groups -1 ) { + fprintf(fp, "%s},\n", space8); + } else { + fprintf(fp, "%s}\n", space8); + } + } + + fprintf(fp, "%s]\n", space4); + } + + fprintf(fp, "}\n"); + } if (file_open) { fclose(fp); @@ -1982,7 +2171,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->write_index) { if (sam_idx_save(param->out) < 0) { - print_error_errno("markdup", "writing index failed"); + print_error_errno("markdup", "error, writing index failed"); goto fail; } } @@ -1990,10 +2179,12 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(stat_array); kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); kl_destroy(read_queue, read_buffer); kh_destroy(duplicates, dup_hash); + kh_destroy(read_groups, rg_hash); 
sam_hdr_destroy(header); return 0; @@ -2009,10 +2200,12 @@ static int bam_mark_duplicates(md_param_t *param) { } } kh_destroy(duplicates, dup_hash); + kh_destroy(read_groups, rg_hash); if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(stat_array); kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); sam_hdr_destroy(header); @@ -2029,6 +2222,7 @@ static int markdup_usage(void) { fprintf(stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); fprintf(stderr, " -s Report stats.\n"); fprintf(stderr, " -f NAME Write stats to named file. Implies -s.\n"); + fprintf(stderr, " --json Output stats in JSON. Also implies -s\n"); fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); fprintf(stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); fprintf(stderr, " -c Clear previous duplicate settings and tags.\n"); @@ -2045,6 +2239,7 @@ static int markdup_usage(void) { fprintf(stderr, " --barcode-tag STR Use barcode a tag that duplicates much match.\n"); fprintf(stderr, " --barcode-name Use the UMI/barcode in the read name (eigth colon delimited part).\n"); fprintf(stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n"); + fprintf(stderr, " --use-read-groups Use the read group tags in duplicate matching.\n"); fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
" Mainly for information and debugging.\n"); @@ -2059,7 +2254,7 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { int c, ret, bc_name = 0; - char wmode[4] = {'w', 'b', 0, 0}; + char wmode[4] = {'w', 0, 0, 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; kstring_t tmpprefix = {0, 0, NULL}; @@ -2068,7 +2263,7 @@ int bam_markdup(int argc, char **argv) { char *regex = NULL, *bc_regex = NULL; char *regex_order = "txy"; md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL}; + 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2081,6 +2276,8 @@ int bam_markdup(int argc, char **argv) { {"barcode-tag", required_argument, NULL, 1006}, {"barcode-name", no_argument, NULL, 1007}, {"barcode-rgx", required_argument, NULL, 1008}, + {"use-read-groups", no_argument, NULL, 1009}, + {"json", no_argument, NULL, 1010}, {NULL, 0, NULL, 0} }; @@ -2101,12 +2298,12 @@ int bam_markdup(int argc, char **argv) { } else if (strcmp(optarg, "s") == 0) { param.mode = MD_MODE_SEQUENCE; } else { - fprintf(stderr, "[markdup] error: unknown mode '%s'.\n", optarg); + print_error("markdup", "error, unknown mode '%s'.\n", optarg); return markdup_usage(); } break; - case 'u': wmode[2] = '0'; break; + case 'u': wmode[1] = '0'; break; case 1001: param.include_fails = 1; break; case 1002: param.no_pg = 1; break; case 1003: param.check_chain = 0; break; @@ -2115,6 +2312,8 @@ int bam_markdup(int argc, char **argv) { case 1006: param.barcode = optarg; break; case 1007: bc_name = 1; break; case 1008: bc_name = 1, bc_regex = optarg; break; + case 1009: param.read_groups = 1; break; + case 1010: param.json = 1; param.do_stats = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -2125,7 +2324,7 @@ int 
bam_markdup(int argc, char **argv) { return markdup_usage(); if (param.barcode && bc_name) { - fprintf(stderr, "[markdup] Error: cannot specify --barcode-tag and " + print_error("markdup", "error, cannot specify --barcode-tag and " "--barcode-name (or --barcode-rgx) at same time.\n"); return 1; } @@ -2158,12 +2357,12 @@ int bam_markdup(int argc, char **argv) { param.rgx_y = 2; param.rgx_t = 0; } else { - fprintf(stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order); + print_error("markdup", "error, could not recognise regex coordinate order \"%s\".\n", regex_order); return 1; } if ((param.rgx = malloc(sizeof(regex_t))) == NULL) { - fprintf(stderr, "[markdup] error: could not allocate memory for regex.\n"); + print_error("markdup", "error, could not allocate memory for regex.\n"); return 1; } @@ -2171,7 +2370,7 @@ int bam_markdup(int argc, char **argv) { char err_msg[256]; regerror(result, param.rgx, err_msg, 256); - fprintf(stderr, "[markdup] error: regex error \"%s\"\n", err_msg); + print_error("markdup", "error, regex fail \"%s\"\n", err_msg); free(param.rgx); return 1; } @@ -2182,10 +2381,10 @@ int bam_markdup(int argc, char **argv) { /* From Illumina UMI documentation: "The UMI sequence is located in the eighth colon-delimited field of the read name (QNAME)". 
*/ - char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)"; + char *rgx = "[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:([!-?A-~]+)"; if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) { - fprintf(stderr, "[markdup] error: could not allocate memory for barcode regex.\n"); + print_error("markdup", "error, could not allocate memory for barcode regex.\n"); return 1; } @@ -2197,7 +2396,7 @@ int bam_markdup(int argc, char **argv) { char err_msg[256]; regerror(result, param.bc_rgx, err_msg, 256); - fprintf(stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg); + print_error("markdup", "error, barcode regex fail \"%s\"\n", err_msg); free(param.bc_rgx); return 1; } @@ -2206,21 +2405,22 @@ int bam_markdup(int argc, char **argv) { param.in = sam_open_format(argv[optind], "r", &ga.in); if (!param.in) { - print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]); + print_error_errno("markdup", "error, failed to open \"%s\" for input", argv[optind]); return 1; } - sam_open_mode(wmode + 1, argv[optind + 1], NULL); + strcat(wmode, "b"); // default if unknown suffix + sam_open_mode(wmode + strlen(wmode)-1, argv[optind + 1], NULL); param.out = sam_open_format(argv[optind + 1], wmode, &ga.out); if (!param.out) { - print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); + print_error_errno("markdup", "error, failed to open \"%s\" for output", argv[optind + 1]); return 1; } if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { - fprintf(stderr, "[markdup] error creating thread pool\n"); + print_error("markdup", "error creating thread pool.\n"); return 1; } @@ -2256,7 +2456,7 @@ int bam_markdup(int argc, char **argv) { sam_close(param.in); if (sam_close(param.out) < 0) { - fprintf(stderr, "[markdup] error closing output file\n"); + print_error("markdup", "error closing output file.\n"); ret = 1; } diff --git 
a/samtools/bam_markdup.c.pysam.c b/samtools/bam_markdup.c.pysam.c index 3c14d8b..3e3b0b5 100644 --- a/samtools/bam_markdup.c.pysam.c +++ b/samtools/bam_markdup.c.pysam.c @@ -3,7 +3,7 @@ /* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone through fixmates with the mate scoring option on. - Copyright (C) 2017-2022 Genome Research Ltd. + Copyright (C) 2017-2023 Genome Research Ltd. Author: Andrew Whitwham @@ -76,6 +76,8 @@ typedef struct { int rgx_t; char *barcode; regex_t *bc_rgx; + int read_groups; + int json; } md_param_t; typedef struct { @@ -84,6 +86,7 @@ typedef struct { int32_t this_ref; int32_t other_ref; int32_t barcode; + int32_t read_group; int8_t single; int8_t leftmost; int8_t orientation; @@ -94,8 +97,10 @@ typedef struct read_queue_s { key_data_t single_key; bam1_t *b; struct read_queue_s *duplicate; + struct read_queue_s *original; hts_pos_t pos; int dup_checked; + int read_group; } read_queue_t; typedef struct { @@ -105,6 +110,7 @@ typedef struct { typedef struct { char *name; char type; + int read_group; } dup_map_t; typedef struct { @@ -118,13 +124,27 @@ typedef struct { int end; } check_t; - typedef struct { check_t *c; size_t size; size_t length; } check_list_t; +typedef struct { + long reading; + long writing; + long excluded; + long duplicate; + long single; + long pair; + long single_dup; + long examined; + long optical; + long single_optical; + long np_duplicate; + long np_opt_duplicate; +} stats_block_t; + static khint32_t do_hash(unsigned char *key, khint32_t len); static khint_t hash_key(key_data_t key) { @@ -132,16 +152,17 @@ static khint_t hash_key(key_data_t key) { khint_t hash; if (key.single) { - unsigned char sig[17]; + unsigned char sig[21]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; memcpy(sig + i, &key.orientation, 1); i += 1; memcpy(sig + i, &key.barcode, 4); i += 4; + memcpy(sig + i, &key.read_group, 4); i += 4; hash = do_hash(sig, i); } else { - unsigned char 
sig[30]; + unsigned char sig[34]; memcpy(sig + i, &key.this_ref, 4); i += 4; memcpy(sig + i, &key.this_coord, 8); i += 8; @@ -150,6 +171,7 @@ static khint_t hash_key(key_data_t key) { memcpy(sig + i, &key.leftmost, 1); i += 1; memcpy(sig + i, &key.orientation, 1); i += 1; memcpy(sig + i, &key.barcode, 4); i += 4; + memcpy(sig + i, &key.read_group, 4); i += 4; hash = do_hash(sig, i); } @@ -171,6 +193,8 @@ static int key_equal(key_data_t a, key_data_t b) { match = 0; else if (a.barcode != b.barcode) match = 0; + else if (a.read_group != b.read_group) + match = 0; if (!a.single) { if (a.other_coord != b.other_coord) @@ -208,6 +232,7 @@ static int key_equal(key_data_t a, key_data_t b) { KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id +KHASH_MAP_INIT_STR(read_groups, int) // read group lookup /* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */ @@ -237,7 +262,7 @@ static int64_t get_mate_score(bam1_t *b) { if ((data = bam_aux_get(b, "ms"))) { score = bam_aux2i(data); } else { - fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n"); return -1; } @@ -267,7 +292,7 @@ static int64_t calc_score(bam1_t *b) read is leftmost of the pair. 
*/ -static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { +static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) { hts_pos_t this_coord, this_end, other_coord, other_end, leftmost; int32_t this_ref, other_ref, barcode = 0; int8_t orientation, left_read; @@ -283,14 +308,14 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * if ((data = bam_aux_get(bam, "MC"))) { if (!(cig = bam_aux2Z(data))) { - fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); + print_error("markdup", "error, MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n"); return 1; } other_end = unclipped_other_end(bam->core.mpos, cig); other_coord = unclipped_other_start(bam->core.mpos, cig); } else { - fprintf(samtools_stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no MC tag. Please run samtools fixmate on file first.\n"); return 1; } @@ -466,7 +491,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); + print_error("markdup", "warning, %s tag wrong type. 
Aux tag needs to be a string type.\n", param->barcode); } } else { barcode = do_hash((unsigned char *)bar, strlen(bar)); @@ -490,7 +515,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname); } } } else { @@ -500,13 +525,13 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * char warn_msg[256]; regerror(result, param->bc_rgx, warn_msg, 256); - fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname); } } } if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { - fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n", + print_error("markdup", "warning, %ld barcode read warnings. New warnings will not be reported.\n", *warnings); } @@ -518,6 +543,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * key->leftmost = left_read; key->orientation = orientation; key->barcode = barcode; + key->read_group = rg_num; return 0; } @@ -527,7 +553,7 @@ static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long * Uses unclipped start (or end depending on orientation), reference id, and orientation. 
*/ -static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) { +static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) { hts_pos_t this_coord; int32_t this_ref, barcode = 0; int8_t orientation; @@ -551,7 +577,7 @@ static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, lon (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); + print_error("markdup", "warning, %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode); } } else { barcode = do_hash((unsigned char *)bar, strlen(bar)); @@ -575,7 +601,7 @@ static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, lon (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname); + print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname); } } } else { @@ -585,27 +611,29 @@ static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, lon char warn_msg[256]; regerror(result, param->bc_rgx, warn_msg, 256); - fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname); + print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname); } } } if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) { - fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n", + print_error("markdup", "warning, %ld barcode read warnings. New warnings will not be reported.\n", *warnings); } + key->single = 1; key->this_ref = this_ref; key->this_coord = this_coord; key->orientation = orientation; key->barcode = barcode; + key->read_group = rg_num; } /* Add the duplicate name to a hash if it does not exist. 
*/ -static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) { +static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type, int group) { khiter_t d; int ret; @@ -630,7 +658,7 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n kh_value(d_hash, d).name = strdup(orig_name); if (kh_value(d_hash, d).name == NULL) { - fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n"); + print_error("markdup", "error, unable to allocate memory for duplicate original name.\n"); return 1; } } else { @@ -638,8 +666,9 @@ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_n } kh_value(d_hash, d).type = type; + kh_value(d_hash, d).read_group = group; } else { - fprintf(samtools_stderr, "[markdup] error: unable to store supplementary duplicates.\n"); + print_error("markdup", "error, unable to store supplementary duplicates.\n"); free(name); return 1; } @@ -692,7 +721,7 @@ static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_b (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname); + print_error("markdup", "warning, cannot decipher read name %s for optical duplicate marking.\n", qname); } return 1; @@ -703,7 +732,7 @@ static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_b (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname); + print_error("markdup", "warning, cannot decipher x coordinate in %s .\n", qname); } return 1; @@ -715,7 +744,7 @@ static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_b (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", 
qname); + print_error("markdup", "warning, cannot decipher y coordinate in %s .\n", qname); } return 1; @@ -765,7 +794,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen); + print_error("markdup", "warning, x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen); } return 1; @@ -779,7 +808,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord); + print_error("markdup", "warning, cannot decipher x coordinate in %s (%s).\n", qname, coord); } return 1; @@ -789,7 +818,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen); + print_error("markdup", "warning, y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen); } return 1; @@ -803,7 +832,7 @@ static inline int get_coordinates_regex(md_param_t *param, const char *qname, in (*warnings)++; if (*warnings <= BMD_WARNING_MAX) { - fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord); + print_error("markdup", "warning, cannot decipher y coordinate in %s (%s).\n", qname, coord); } return 1; @@ -923,7 +952,7 @@ static int optical_duplicate_partial(md_param_t *param, const char *name, const /* Mark the read as a duplicate and update the duplicate hash (if needed) */ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup, - long *optical, long *warn) { + int 
read_group, long *optical, long *warn) { char dup_type = 0; long incoming_warnings = *warn; @@ -931,7 +960,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam if (param->tag) { if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) { - fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n"); + print_error("markdup", "error, unable to append 'do' tag.\n"); return -1; } } @@ -948,7 +977,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam } if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) { - fprintf(samtools_stderr, "[markdup] warning: %ld decipher read name warnings. New warnings will not be reported.\n", + print_error("markdup", "warning, %ld decipher read name warnings. New warnings will not be reported.\n", *warn); } @@ -960,7 +989,7 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam original = bam_get_qname(ori); } - if (add_duplicate(dup_hash, dup, original, dup_type)) + if (add_duplicate(dup_hash, dup, original, dup_type, read_group)) return -1; } } @@ -970,18 +999,18 @@ static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam /* If the duplicate type has changed to optical then retag and duplicate hash. 
*/ -static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) { +static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, stats_block_t *stats) { int ret = 0; if (bam_aux_update_str(b, "dt", 3, "SQ")) { - fprintf(samtools_stderr, "[markdup] error: unable to update 'dt' tag.\n"); + print_error("markdup", "error, unable to update 'dt' tag.\n"); ret = -1; } if (paired) { - (*optical_pair)++; + stats->optical++; } else { - (*optical_single)++; + stats->single_optical++; } if (param->supp) { @@ -995,7 +1024,7 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash if (d == kh_end(dup_hash)) { // error, name should already be in dup hash - fprintf(samtools_stderr, "[markdup] error: duplicate name %s not found in hash.\n", + print_error("markdup", "error, duplicate name %s not found in hash.\n", bam_get_qname(b)); ret = -1; } else { @@ -1013,7 +1042,7 @@ static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash Returns 0 on success, >0 on coordinate reading error (program can continue) or <0 on an error (program should not continue. 
*/ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori, - check_list_t *list, long *warn, long *optical_single, long *optical_pair) { + check_list_t *list, long *warn, stats_block_t *stats) { int ret = 0, coord_fail = 0; char *ori_name = bam_get_qname(ori->b); @@ -1036,7 +1065,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * list->size *= 2; if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) { - fprintf(samtools_stderr, "[markdup] error: Unable to expand opt check list.\n"); + print_error("markdup", "error, Unable to expand optical check list.\n"); return -1; } @@ -1064,13 +1093,13 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * if (old_name) { if (strcmp(old_name, ori_name) != 0) { if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) { - fprintf(samtools_stderr, "[markdup] error: unable to update 'do' tag.\n"); + print_error("markdup", "error, unable to update 'do' tag.\n"); ret = -1; break; } } } else { - fprintf(samtools_stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); + print_error("markdup", "error, 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b)); ret = -1; break; } @@ -1095,7 +1124,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * is_opt = optical_duplicate_partial(param, ori_name, t_beg, t_end, x, y, current->b, c, param->opt_dist, warn); if (!c->opt && is_opt) { - if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + if (optical_retag(param, dup_hash, current->b, current_paired, stats)) { ret = -1; break; } @@ -1107,7 +1136,7 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * if (current_paired) { if ((c->mate_score = get_mate_score(current->b)) == -1) { - fprintf(samtools_stderr, "[markdup] error: no ms score 
tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n"); ret = -1; break; } @@ -1121,6 +1150,8 @@ static int check_chain_against_original(md_param_t *param, khash_t(duplicates) * if (!ret && coord_fail) ret = coord_fail; + ori->dup_checked = 1; + return ret; } @@ -1135,7 +1166,7 @@ static int xcoord_sort(const void *a, const void *b) { /* Check all the duplicates against each other to see if they are optical duplicates. */ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list, - long *warn, long *optical_single, long *optical_pair) { + long *warn, stats_block_t *stats) { int ret = 0; size_t curr = 0; @@ -1219,7 +1250,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has if (chk_dup) { // the duplicate is the optical duplicate if (!chk->opt) { // only change if not already an optical duplicate - if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) { + if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) { ret = -1; goto fail; } @@ -1228,7 +1259,7 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has } } else { if (!current->opt) { - if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) { + if (optical_retag(param, dup_hash, current->b, current_paired, stats)) { ret = -1; goto fail; } @@ -1248,53 +1279,24 @@ static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_has /* Where there is more than one duplicate go down the list and check for optical duplicates and change do tags (where used) to point to original (non-duplicate) read. 
*/ -static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list, - const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single, - long *optical_pair, const int check_range) { +static int find_duplicate_chains(md_param_t *param, read_queue_t *in_read , khash_t(duplicates) *dup_hash, check_list_t *dup_list, + long *warn, stats_block_t *stats) { int ret = 0; - kliter_t(read_queue) *rq; - - rq = kl_begin(read_buffer); - while (rq != kl_end(read_buffer)) { - read_queue_t *in_read = &kl_val(rq); + while (in_read->original) in_read = in_read->original; - if (check_range) { - /* Just check against the moving window of reads based on coordinates and max read length. */ - if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) { - break; - } - } else { - // this is the last set of results and the end entry will be blank - if (!bam_get_qname(in_read->b)) { - break; - } + // check against the original for tagging and optical duplication + if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, stats + in_read->read_group))) { + if (ret < 0) { // real error + ret = -1; + } else { // coordinate decoding error + ret = 0; } - - if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain - - // check against the original for tagging and optical duplication - if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) { - if (ret < 0) { // real error - ret = -1; - break; - } else { // coordinate decoding error - ret = 0; - in_read->duplicate = NULL; - continue; - } - } - - // check the rest of the duplicates against each other for optical duplication - if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) { - ret = -1; - break; - } - - 
in_read->duplicate = NULL; + } else { + // check the rest of the duplicates against each other for optical duplication + if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, stats + in_read->read_group)) { + ret = -1; } - - rq = kl_next(rq); } return ret; @@ -1341,7 +1343,7 @@ static unsigned long estimate_library_size(unsigned long paired_reads, unsigned int i; if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) { - fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size.\n"); + print_error("markdup", "warning, unable to calculate estimated library size.\n"); return estimated_size; } @@ -1364,7 +1366,7 @@ static unsigned long estimate_library_size(unsigned long paired_reads, unsigned estimated_size = (unsigned long)(unique_pairs * (m + M) / 2); } else { - fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size." + print_error("markdup", "warning, unable to calculate estimated library size." 
" Read pairs %ld should be greater than duplicate pairs %ld," " which should both be non zero.\n", non_optical_pairs, duplicate_pairs); @@ -1374,6 +1376,67 @@ static unsigned long estimate_library_size(unsigned long paired_reads, unsigned } +static void write_stats(FILE *fp, const char *title, const char *title_con, stats_block_t *stats) { + unsigned long els; + + els = estimate_library_size(stats->pair, stats->duplicate, stats->optical); + + if (title) { + fprintf(fp, "%s%s\n", title, title_con); + } + + fprintf(fp, + "READ: %ld\n" + "WRITTEN: %ld\n" + "EXCLUDED: %ld\n" + "EXAMINED: %ld\n" + "PAIRED: %ld\n" + "SINGLE: %ld\n" + "DUPLICATE PAIR: %ld\n" + "DUPLICATE SINGLE: %ld\n" + "DUPLICATE PAIR OPTICAL: %ld\n" + "DUPLICATE SINGLE OPTICAL: %ld\n" + "DUPLICATE NON PRIMARY: %ld\n" + "DUPLICATE NON PRIMARY OPTICAL: %ld\n" + "DUPLICATE PRIMARY TOTAL: %ld\n" + "DUPLICATE TOTAL: %ld\n" + "ESTIMATED_LIBRARY_SIZE: %ld\n", stats->reading, stats->writing, stats->excluded, stats->examined, stats->pair, stats->single, + stats->duplicate, stats->single_dup, stats->optical, stats->single_optical, stats->np_duplicate, stats->np_opt_duplicate, + stats->single_dup + stats->duplicate, stats->single_dup + stats->duplicate + stats->np_duplicate, els); +} + + +static void write_json_stats(FILE *fp, const char *offset, const char *group_name, stats_block_t *stats, const char *end) { + unsigned long els; + + els = estimate_library_size(stats->pair, stats->duplicate, stats->optical); + + if (group_name) { + fprintf(fp, "%s\"READ GROUP\": \"%s\",\n", offset, group_name); + } + + fprintf(fp, "%s\"READ\": %ld,\n", offset, stats->reading); + fprintf(fp, "%s\"WRITTEN\": %ld,\n", offset, stats->writing); + fprintf(fp, "%s\"EXCLUDED\": %ld,\n", offset, stats->excluded); + fprintf(fp, "%s\"EXAMINED\": %ld,\n", offset, stats->examined); + fprintf(fp, "%s\"PAIRED\": %ld,\n", offset, stats->pair); + fprintf(fp, "%s\"SINGLE\": %ld,\n", offset, stats->single); + fprintf(fp, "%s\"DUPLICATE PAIR\": 
%ld,\n", offset, stats->duplicate); + fprintf(fp, "%s\"DUPLICATE SINGLE\": %ld,\n", offset, stats->single_dup); + fprintf(fp, "%s\"DUPLICATE PAIR OPTICAL\": %ld,\n", offset, stats->optical); + fprintf(fp, "%s\"DUPLICATE SINGLE OPTICAL\": %ld,\n", offset, stats->single_optical); + fprintf(fp, "%s\"DUPLICATE NON PRIMARY\": %ld,\n", offset, stats->np_duplicate); + fprintf(fp, "%s\"DUPLICATE NON PRIMARY OPTICAL\": %ld,\n", offset, stats->np_opt_duplicate); + fprintf(fp, "%s\"DUPLICATE PRIMARY TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate); + fprintf(fp, "%s\"DUPLICATE TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate + stats->np_duplicate); + fprintf(fp, "%s\"ESTIMATED_LIBRARY_SIZE\": %ld", offset, els); + + if (end) { + fprintf(fp, "%s", end); + } +} + + /* Compare the reads near each other (coordinate sorted) and try to spot the duplicates. Generally the highest quality scoring is chosen as the original and all others the duplicates. The score is based on the sum of the quality values (<= 15) of the read and its mate (if any). 
@@ -1391,25 +1454,26 @@ static int bam_mark_duplicates(md_param_t *param) { klist_t(read_queue) *read_buffer = kl_init(read_queue); kliter_t(read_queue) *rq; khash_t(duplicates) *dup_hash = kh_init(duplicates); + khash_t(read_groups) *rg_hash = kh_init(read_groups); int32_t prev_tid; hts_pos_t prev_coord; read_queue_t *in_read; int ret; - long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical; - long np_duplicate, np_opt_duplicate; + stats_block_t *stats, *stat_array = NULL; + int num_groups = 0; long opt_warnings = 0, bc_warnings = 0; tmp_file_t temp; char *idx_fn = NULL; int exclude = 0; check_list_t dup_list = {NULL, 0, 0}; - if (!pair_hash || !single_hash || !read_buffer || !dup_hash) { - fprintf(samtools_stderr, "[markdup] out of memory\n"); + if (!pair_hash || !single_hash || !read_buffer || !dup_hash || !rg_hash) { + print_error("markdup", "error, unable to allocate memory to initialise structures.\n"); goto fail; } if ((header = sam_hdr_read(param->in)) == NULL) { - fprintf(samtools_stderr, "[markdup] error reading header\n"); + print_error("markdup", "error reading header\n"); goto fail; } @@ -1417,7 +1481,7 @@ static int bam_mark_duplicates(md_param_t *param) { // only really works on coordinate sorted files. kstring_t str = KS_INITIALIZE; if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) { - fprintf(samtools_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n"); + print_error("markdup", "error, queryname sorted, must be sorted by coordinate.\n"); ks_free(&str); goto fail; } @@ -1427,11 +1491,11 @@ static int bam_mark_duplicates(md_param_t *param) { param->arg_list ? "CL" : NULL, param->arg_list ? 
param->arg_list : NULL, NULL) != 0) { - fprintf(samtools_stderr, "[markdup] warning: unable to add @PG line to header.\n"); + print_error("markdup", "warning, unable to add @PG line to header.\n"); } if (sam_hdr_write(param->out, header) < 0) { - fprintf(samtools_stderr, "[markdup] error writing header.\n"); + print_error("markdup", "error writing header.\n"); goto fail; } if (param->write_index) { @@ -1439,26 +1503,86 @@ static int bam_mark_duplicates(md_param_t *param) { goto fail; } + if (param->read_groups) { + num_groups = sam_hdr_count_lines(header, "RG"); + int g_ret = 0; + + if (num_groups > 0) { + int i; + + for (i = 0; i < num_groups; i++) { + const char *rg_key; + khiter_t rg; + + rg_key = sam_hdr_line_name(header, "RG", i); + + if (rg_key) { + rg = kh_get(read_groups, rg_hash, rg_key); + + if (rg == kh_end(rg_hash)) { // new entry + rg = kh_put(read_groups, rg_hash, rg_key, &g_ret); + + if (g_ret > 0) { + kh_value(rg_hash, rg) = i + 1; + } else { + print_error("markdup", "error, unable to populate read group ids. " + "Read groups will not be used\n"); + g_ret = -1; + break; + } + } else { + print_error("markdup", "error, duplicate read group ids %s." + "Read groups will not be used\n", rg_key); + g_ret = -1; + break; + } + } else { + print_error("markdup", "error, Unable to retrieve read group at position %d." 
+ "Read groups will not be used\n", i); + g_ret = -1; + break; + } + } + } else { + print_error("markdup", "error, no read groups found.\n"); + g_ret = -1; + } + + if (g_ret < 0) { + print_error("markdup", "error, read groups will not be used.\n"); + param->read_groups = 0; + num_groups = 0; + } + } + + // stat_array[0] will be for ungrouped reads + stat_array = calloc(num_groups + 1, sizeof(stats_block_t)); + + if (stat_array == NULL) { + print_error("markdup", "error, unable to allocate memory for stats.\n"); + goto fail; + } + // used for coordinate order checks prev_tid = prev_coord = 0; // get the buffer going in_read = kl_pushp(read_queue, read_buffer); if (!in_read) { - fprintf(samtools_stderr, "[markdup] out of memory\n"); + print_error("markdup", "error, unable to allocate memory to hold reads.\n"); goto fail; } // handling supplementary reads needs a temporary file if (param->supp) { if (tmp_file_open_write(&temp, param->prefix, 1)) { - fprintf(samtools_stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix); + print_error("markdup", "error, unable to open tmp file %s.\n", param->prefix); goto fail; } } if ((in_read->b = bam_init1()) == NULL) { - fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + print_error("markdup", "error, unable to allocate memory for alignment.\n"); goto fail; } @@ -1470,22 +1594,18 @@ static int bam_mark_duplicates(md_param_t *param) { dup_list.c = NULL; if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) { - fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for dup_list.\n"); + print_error("markdup", "error, unable to allocate memory for dup_list.\n"); goto fail; } } - reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0; - np_duplicate = np_opt_duplicate = 0; - while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) { - int dup_checked = 0; // do some basic coordinate order 
checks if (in_read->b->core.tid >= 0) { // -1 for unmapped reads if (in_read->b->core.tid < prev_tid || ((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) { - fprintf(samtools_stderr, "[markdup] error: not in coordinate sorted order.\n"); + print_error("markdup", "error, not in coordinate sorted order.\n"); goto fail; } } @@ -1495,9 +1615,30 @@ static int bam_mark_duplicates(md_param_t *param) { in_read->pair_key.single = 1; in_read->single_key.single = 0; in_read->duplicate = NULL; + in_read->original = NULL; in_read->dup_checked = 0; + in_read->read_group = 0; - reading++; + if (param->read_groups) { + uint8_t *data; + char *rg; + + if ((data = bam_aux_get(in_read->b, "RG"))) { + if ((rg = bam_aux2Z(data))) { + khiter_t r; + + r = kh_get(read_groups, rg_hash, rg); + + if (r != kh_end(rg_hash)) { + in_read->read_group = kh_value(rg_hash, r); + } + } + } + } + + stats = stat_array + in_read->read_group; + + stats->reading++; if (param->clear && (in_read->b->core.flag & BAM_FDUP)) { uint8_t *data; @@ -1521,7 +1662,7 @@ static int bam_mark_duplicates(md_param_t *param) { // read must not be secondary, supplementary, unmapped or (possibly) failed QC if (!(in_read->b->core.flag & exclude)) { - examined++; + stats->examined++; // look at the pairs first @@ -1531,14 +1672,14 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - if (make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) { - fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n"); + if (make_pair_key(param, &pair_key, in_read->b, in_read->read_group, &bc_warnings)) { + print_error("markdup", "error, unable to assign pair hash key.\n"); goto fail; } - make_single_key(param, &single_key, in_read->b, &bc_warnings); + make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings); - pair++; + stats->pair++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos // put in singles 
hash for checking against non paired reads @@ -1558,18 +1699,20 @@ static int bam_mark_duplicates(md_param_t *param) { // scores more than one read of the pair bam1_t *dup = bp->p->b; - if (param->check_chain) + if (param->check_chain) { in_read->duplicate = bp->p; + bp->p->original = in_read; + } bp->p = in_read; - if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; - single_dup++; + stats->single_dup++; } } else { - fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); + print_error("markdup", "error, single hashing failure for paired read.\n"); goto fail; } @@ -1597,14 +1740,14 @@ static int bam_mark_duplicates(md_param_t *param) { } } else { if ((mate_tmp = get_mate_score(bp->p->b)) == -1) { - fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n"); goto fail; } else { old_score = calc_score(bp->p->b) + mate_tmp; } if ((mate_tmp = get_mate_score(in_read->b)) == -1) { - fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n"); + print_error("markdup", "error, no ms score tag. 
Please run samtools fixmate on file first.\n"); goto fail; } else { new_score = calc_score(in_read->b) + mate_tmp; @@ -1638,6 +1781,8 @@ static int bam_mark_duplicates(md_param_t *param) { } else { in_read->duplicate = bp->p; } + + bp->p->original = in_read; } bp->p = in_read; @@ -1658,17 +1803,18 @@ static int bam_mark_duplicates(md_param_t *param) { } bp->p->duplicate = in_read; + in_read->original = bp->p; } dup = in_read->b; } - if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings)) goto fail; - duplicate++; + stats->duplicate++; } else { - fprintf(samtools_stderr, "[markdup] error: pair hashing failure.\n"); + print_error("markdup", "error, pair hashing failure.\n"); goto fail; } } else { // do the single (or effectively single) reads @@ -1676,9 +1822,9 @@ static int bam_mark_duplicates(md_param_t *param) { key_data_t single_key; in_hash_t *bp; - make_single_key(param, &single_key, in_read->b, &bc_warnings); + make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings); - single++; + stats->single++; in_read->pos = single_key.this_coord; // cigar/orientation modified pos k = kh_put(reads, single_hash, single_key, &ret); @@ -1699,9 +1845,10 @@ static int bam_mark_duplicates(md_param_t *param) { } bp->p->duplicate = in_read; + in_read->original = bp->p; } - if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; } else { @@ -1716,8 +1863,10 @@ static int bam_mark_duplicates(md_param_t *param) { if (new_score > old_score) { // swap reads dup = bp->p->b; - if (param->check_chain) + if (param->check_chain) { in_read->duplicate = bp->p; + bp->p->original = in_read; + } bp->p = in_read; } else { @@ -1727,23 +1876,24 @@ static int 
bam_mark_duplicates(md_param_t *param) { } bp->p->duplicate = in_read; + in_read->original = bp->p; } dup = in_read->b; } - if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings)) + if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings)) goto fail; } - single_dup++; + stats->single_dup++; } else { - fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n"); + print_error("markdup", "error, single hashing failure for single read.\n"); goto fail; } } } else { - excluded++; + stats->excluded++; } // loop through the stored reads and write out those we @@ -1758,36 +1908,27 @@ static int bam_mark_duplicates(md_param_t *param) { break; } - if (!dup_checked && param->check_chain) { - // check for multiple optical duplicates of the same original read - - if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) { - fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n"); + if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) { + if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) { + print_error("markdup", "error, duplicate checking failed.\n"); goto fail; } - - dup_checked = 1; - } - - - if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) { - break; } if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { - fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); + print_error("markdup", "error, writing temp output failed.\n"); goto fail; } } else { if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); + print_error("markdup", "error, writing output failed.\n"); goto 
fail; } } - writing++; + stat_array[in_read->read_group].writing++; } // remove from hash @@ -1809,49 +1950,48 @@ static int bam_mark_duplicates(md_param_t *param) { // set the next one up for reading in_read = kl_pushp(read_queue, read_buffer); if (!in_read) { - fprintf(samtools_stderr, "[markdup] out of memory\n"); + print_error("markdup", "error, unable to allocate memory for read in queue.\n"); goto fail; } if ((in_read->b = bam_init1()) == NULL) { - fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n"); + print_error("markdup", "error, unable to allocate memory for alignment.\n"); goto fail; } } if (ret < -1) { - fprintf(samtools_stderr, "[markdup] error: truncated input file.\n"); + print_error("markdup", "error, truncated input file.\n"); goto fail; } - // one last check - if (param->tag || param->opt_dist) { - if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) { - fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n"); - goto fail; - } - } - // write out the end of the list rq = kl_begin(read_buffer); while (rq != kl_end(read_buffer)) { in_read = &kl_val(rq); if (bam_get_qname(in_read->b)) { // last entry will be blank + if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) { + if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) { + print_error("markdup", "error, duplicate checking failed.\n"); + goto fail; + } + } + if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) { if (param->supp) { if (tmp_file_write(&temp, in_read->b)) { - fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n"); + print_error("markdup", "error, writing temp output failed on final write.\n"); goto fail; } } else { if (sam_write1(param->out, header, in_read->b) < 0) { - fprintf(samtools_stderr, "[markdup] error: writing output failed.\n"); + 
print_error("markdup", "error, writing output failed on final write.\n"); goto fail; } } - writing++; + stat_array[in_read->read_group].writing++; } } @@ -1864,7 +2004,7 @@ static int bam_mark_duplicates(md_param_t *param) { bam1_t *b; if (tmp_file_end_write(&temp)) { - fprintf(samtools_stderr, "[markdup] error: unable to end tmp writing.\n"); + print_error("markdup", "error, unable to end tmp writing.\n"); goto fail; } @@ -1885,11 +2025,11 @@ static int bam_mark_duplicates(md_param_t *param) { if (k != kh_end(dup_hash)) { b->core.flag |= BAM_FDUP; - np_duplicate++; + stat_array[kh_val(dup_hash, k).read_group].np_duplicate++; if (param->tag && kh_val(dup_hash, k).name) { if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) { - fprintf(samtools_stderr, "[markdup] error: unable to append supplementary 'do' tag.\n"); + print_error("markdup", "error, unable to append supplementary 'do' tag.\n"); goto fail; } } @@ -1897,7 +2037,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->opt_dist) { if (kh_val(dup_hash, k).type) { bam_aux_update_str(b, "dt", 3, "SQ"); - np_opt_duplicate++; + stat_array[kh_val(dup_hash, k).read_group].np_opt_duplicate++; } else { bam_aux_update_str(b, "dt", 3, "LB"); } @@ -1907,14 +2047,14 @@ static int bam_mark_duplicates(md_param_t *param) { if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) { if (sam_write1(param->out, header, b) < 0) { - fprintf(samtools_stderr, "[markdup] error: writing final output failed.\n"); + print_error("markdup", "error, writing final output failed.\n"); goto fail; } } } if (ret == -1) { - fprintf(samtools_stderr, "[markdup] error: failed to read tmp file.\n"); + print_error("markdup", "error, failed to read tmp file.\n"); goto fail; } @@ -1931,22 +2071,23 @@ static int bam_mark_duplicates(md_param_t *param) { } if (opt_warnings) { - fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n", 
+ print_error("markdup", "warning, number of failed attempts to get coordinates from read names = %ld\n", opt_warnings); } if (bc_warnings) { - fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings); + print_error("markdup", "warning, number of failed attempts to get barcodes = %ld\n", bc_warnings); } if (param->do_stats) { FILE *fp; int file_open = 0; - unsigned long els; + stats_block_t total; + int i; if (param->stats_file) { if (NULL == (fp = fopen(param->stats_file, "w"))) { - fprintf(samtools_stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file); + print_error("markdup", "warning, cannot write stats to %s.\n", param->stats_file); fp = samtools_stderr; } else { file_open = 1; @@ -1955,27 +2096,75 @@ static int bam_mark_duplicates(md_param_t *param) { fp = samtools_stderr; } - els = estimate_library_size(pair, duplicate, optical); - - fprintf(fp, - "COMMAND: %s\n" - "READ: %ld\n" - "WRITTEN: %ld\n" - "EXCLUDED: %ld\n" - "EXAMINED: %ld\n" - "PAIRED: %ld\n" - "SINGLE: %ld\n" - "DUPLICATE PAIR: %ld\n" - "DUPLICATE SINGLE: %ld\n" - "DUPLICATE PAIR OPTICAL: %ld\n" - "DUPLICATE SINGLE OPTICAL: %ld\n" - "DUPLICATE NON PRIMARY: %ld\n" - "DUPLICATE NON PRIMARY OPTICAL: %ld\n" - "DUPLICATE PRIMARY TOTAL: %ld\n" - "DUPLICATE TOTAL: %ld\n" - "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single, - duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate, - single_dup + duplicate, single_dup + duplicate + np_duplicate, els); + total = stat_array[0]; + + if (param->read_groups) { + for (i = 1; i <= num_groups; i++) { + total.reading += stat_array[i].reading; + total.writing += stat_array[i].writing; + total.excluded += stat_array[i].excluded; + total.duplicate += stat_array[i].duplicate; + total.single += stat_array[i].single; + total.pair += stat_array[i].pair; + total.single_dup += stat_array[i].single_dup; + total.examined += 
stat_array[i].examined; + total.optical += stat_array[i].optical; + total.single_optical += stat_array[i].single_optical; + total.np_duplicate += stat_array[i].np_duplicate; + total.np_opt_duplicate += stat_array[i].np_opt_duplicate; + } + } + + if (!param->json) { + write_stats(fp, "COMMAND: ", param->arg_list, &total); + fprintf(fp, "\n"); + + if (param->read_groups) { + if (stat_array[0].reading) { + write_stats(fp, "READ GROUP: ", "ungrouped", stat_array); + fprintf(fp, "\n"); + } + + for (i = 0; i < num_groups; i++) { + write_stats(fp, "READ GROUP: ", sam_hdr_line_name(header, "RG", i), stat_array + i + 1); + fprintf(fp, "\n"); + } + } + } else { + char space4[] = " "; + char space8[] = " "; + char space12[] = " "; + + fprintf(fp, "{\n"); + fprintf(fp, "%s\"COMMAND\": \"%s\",\n", space4, param->arg_list); + write_json_stats(fp, space4, NULL, &total, param->read_groups ? ",\n" : "\n"); + + if (param->read_groups) { + fprintf(fp, "%s\"READ GROUPS\": [\n", space4); + + if (stat_array[0].reading) { + fprintf(fp, "%s{\n", space8); + write_json_stats(fp, space12, "ungrouped", stat_array, "\n"); + fprintf(fp, "%s},\n", space8); + } + + for (i = 0; i < num_groups; i++) { + fprintf(fp, "%s{\n", space8); + + write_json_stats(fp, space12, sam_hdr_line_name(header, "RG", i), stat_array + i + 1, "\n"); + + if (i < num_groups -1 ) { + fprintf(fp, "%s},\n", space8); + } else { + fprintf(fp, "%s}\n", space8); + } + } + + fprintf(fp, "%s]\n", space4); + } + + fprintf(fp, "}\n"); + } if (file_open) { fclose(fp); @@ -1984,7 +2173,7 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->write_index) { if (sam_idx_save(param->out) < 0) { - print_error_errno("markdup", "writing index failed"); + print_error_errno("markdup", "error, writing index failed"); goto fail; } } @@ -1992,10 +2181,12 @@ static int bam_mark_duplicates(md_param_t *param) { if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(stat_array); kh_destroy(reads, pair_hash); 
kh_destroy(reads, single_hash); kl_destroy(read_queue, read_buffer); kh_destroy(duplicates, dup_hash); + kh_destroy(read_groups, rg_hash); sam_hdr_destroy(header); return 0; @@ -2011,10 +2202,12 @@ static int bam_mark_duplicates(md_param_t *param) { } } kh_destroy(duplicates, dup_hash); + kh_destroy(read_groups, rg_hash); if (param->check_chain && (param->tag || param->opt_dist)) free(dup_list.c); + free(stat_array); kh_destroy(reads, pair_hash); kh_destroy(reads, single_hash); sam_hdr_destroy(header); @@ -2031,6 +2224,7 @@ static int markdup_usage(void) { fprintf(samtools_stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n"); fprintf(samtools_stderr, " -s Report stats.\n"); fprintf(samtools_stderr, " -f NAME Write stats to named file. Implies -s.\n"); + fprintf(samtools_stderr, " --json Output stats in JSON. Also implies -s\n"); fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n"); fprintf(samtools_stderr, " -d INT Optical distance (if set, marks with dt tag)\n"); fprintf(samtools_stderr, " -c Clear previous duplicate settings and tags.\n"); @@ -2047,6 +2241,7 @@ static int markdup_usage(void) { fprintf(samtools_stderr, " --barcode-tag STR Use barcode a tag that duplicates much match.\n"); fprintf(samtools_stderr, " --barcode-name Use the UMI/barcode in the read name (eigth colon delimited part).\n"); fprintf(samtools_stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n"); + fprintf(samtools_stderr, " --use-read-groups Use the read group tags in duplicate matching.\n"); fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag." 
" Mainly for information and debugging.\n"); @@ -2061,7 +2256,7 @@ static int markdup_usage(void) { int bam_markdup(int argc, char **argv) { int c, ret, bc_name = 0; - char wmode[4] = {'w', 'b', 0, 0}; + char wmode[4] = {'w', 0, 0, 0}; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; htsThreadPool p = {NULL, 0}; kstring_t tmpprefix = {0, 0, NULL}; @@ -2070,7 +2265,7 @@ int bam_markdup(int argc, char **argv) { char *regex = NULL, *bc_regex = NULL; char *regex_order = "txy"; md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL}; + 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0}; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'), @@ -2083,6 +2278,8 @@ int bam_markdup(int argc, char **argv) { {"barcode-tag", required_argument, NULL, 1006}, {"barcode-name", no_argument, NULL, 1007}, {"barcode-rgx", required_argument, NULL, 1008}, + {"use-read-groups", no_argument, NULL, 1009}, + {"json", no_argument, NULL, 1010}, {NULL, 0, NULL, 0} }; @@ -2103,12 +2300,12 @@ int bam_markdup(int argc, char **argv) { } else if (strcmp(optarg, "s") == 0) { param.mode = MD_MODE_SEQUENCE; } else { - fprintf(samtools_stderr, "[markdup] error: unknown mode '%s'.\n", optarg); + print_error("markdup", "error, unknown mode '%s'.\n", optarg); return markdup_usage(); } break; - case 'u': wmode[2] = '0'; break; + case 'u': wmode[1] = '0'; break; case 1001: param.include_fails = 1; break; case 1002: param.no_pg = 1; break; case 1003: param.check_chain = 0; break; @@ -2117,6 +2314,8 @@ int bam_markdup(int argc, char **argv) { case 1006: param.barcode = optarg; break; case 1007: bc_name = 1; break; case 1008: bc_name = 1, bc_regex = optarg; break; + case 1009: param.read_groups = 1; break; + case 1010: param.json = 1; param.do_stats = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return markdup_usage(); @@ -2127,7 +2326,7 @@ int 
bam_markdup(int argc, char **argv) { return markdup_usage(); if (param.barcode && bc_name) { - fprintf(samtools_stderr, "[markdup] Error: cannot specify --barcode-tag and " + print_error("markdup", "error, cannot specify --barcode-tag and " "--barcode-name (or --barcode-rgx) at same time.\n"); return 1; } @@ -2160,12 +2359,12 @@ int bam_markdup(int argc, char **argv) { param.rgx_y = 2; param.rgx_t = 0; } else { - fprintf(samtools_stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order); + print_error("markdup", "error, could not recognise regex coordinate order \"%s\".\n", regex_order); return 1; } if ((param.rgx = malloc(sizeof(regex_t))) == NULL) { - fprintf(samtools_stderr, "[markdup] error: could not allocate memory for regex.\n"); + print_error("markdup", "error, could not allocate memory for regex.\n"); return 1; } @@ -2173,7 +2372,7 @@ int bam_markdup(int argc, char **argv) { char err_msg[256]; regerror(result, param.rgx, err_msg, 256); - fprintf(samtools_stderr, "[markdup] error: regex error \"%s\"\n", err_msg); + print_error("markdup", "error, regex fail \"%s\"\n", err_msg); free(param.rgx); return 1; } @@ -2184,10 +2383,10 @@ int bam_markdup(int argc, char **argv) { /* From Illumina UMI documentation: "The UMI sequence is located in the eighth colon-delimited field of the read name (QNAME)". 
*/ - char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)"; + char *rgx = "[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:([!-?A-~]+)"; if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) { - fprintf(samtools_stderr, "[markdup] error: could not allocate memory for barcode regex.\n"); + print_error("markdup", "error, could not allocate memory for barcode regex.\n"); return 1; } @@ -2199,7 +2398,7 @@ int bam_markdup(int argc, char **argv) { char err_msg[256]; regerror(result, param.bc_rgx, err_msg, 256); - fprintf(samtools_stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg); + print_error("markdup", "error, barcode regex fail \"%s\"\n", err_msg); free(param.bc_rgx); return 1; } @@ -2208,21 +2407,22 @@ int bam_markdup(int argc, char **argv) { param.in = sam_open_format(argv[optind], "r", &ga.in); if (!param.in) { - print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]); + print_error_errno("markdup", "error, failed to open \"%s\" for input", argv[optind]); return 1; } - sam_open_mode(wmode + 1, argv[optind + 1], NULL); + strcat(wmode, "b"); // default if unknown suffix + sam_open_mode(wmode + strlen(wmode)-1, argv[optind + 1], NULL); param.out = sam_open_format(argv[optind + 1], wmode, &ga.out); if (!param.out) { - print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]); + print_error_errno("markdup", "error, failed to open \"%s\" for output", argv[optind + 1]); return 1; } if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { - fprintf(samtools_stderr, "[markdup] error creating thread pool\n"); + print_error("markdup", "error creating thread pool.\n"); return 1; } @@ -2258,7 +2458,7 @@ int bam_markdup(int argc, char **argv) { sam_close(param.in); if (sam_close(param.out) < 0) { - fprintf(samtools_stderr, "[markdup] error closing output file\n"); + print_error("markdup", "error closing output file.\n"); ret = 1; } 
diff --git a/samtools/bam_mate.c b/samtools/bam_mate.c index 4239fd1..1f2b576 100644 --- a/samtools/bam_mate.c +++ b/samtools/bam_mate.c @@ -1,6 +1,6 @@ /* bam_mate.c -- fix mate pairing information and clean up flags. - Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd. + Copyright (C) 2009, 2011-2017, 2019, 2022 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. @@ -253,8 +253,230 @@ static int add_mate_score(bam1_t *src, bam1_t *dest) return 0; } +// Completely delete the CIGAR field +static void clear_cigar(bam1_t *b) { + memmove(bam_get_cigar(b), bam_get_seq(b), + b->data + b->l_data - bam_get_seq(b)); + b->l_data -= 4*b->core.n_cigar; + b->core.n_cigar = 0; +} + +// Trim a CIGAR field to end on reference position "end". Remaining bases +// are turned to soft clips. +static int bam_trim(bam1_t *b, hts_pos_t end) { + hts_pos_t pos = b->core.pos; + int n_cigar = b->core.n_cigar, i; + uint32_t new_cigar_a[1024]; + uint32_t *new_cigar = new_cigar_a; + uint32_t *cigar = bam_get_cigar(b); + + // Find end of alignment or end of ref + int op = 0, oplen = 0; + for (i = 0; i < n_cigar; i++) { + op = bam_cigar_op(cigar[i]); + oplen = bam_cigar_oplen(cigar[i]); + if (!(bam_cigar_type(op) & 2)) + continue; + pos += oplen; + if (pos > end) + break; + } + + if (i == n_cigar) + // looks fine already + return 0; + + int old_i = i, j = 0; + // At worst we grow by 1 element (eg 100M -> 70M30S) + if (n_cigar-i >= 1024-1) { + new_cigar = malloc(4*(n_cigar-i+1)); + if (!new_cigar) + return -1; + } + + // We fill out to new_cigar from here on. + if (pos-oplen < end) { + // Partial CIGAR op? Split existing tag. 
+ cigar[old_i++] = bam_cigar_gen(end - (pos-oplen), op); + new_cigar[j++] = bam_cigar_gen(pos-end, BAM_CSOFT_CLIP); + } else if (pos > end) { + // entirely off the chromosome; this will trigger CIGAR *, MQUAL 0 + b->core.flag |= BAM_FUNMAP; + b->core.flag &= ~BAM_FPROPER_PAIR; + } else { + // CIGAR op started on the trim junction + new_cigar[j++] = bam_cigar_gen(oplen, BAM_CSOFT_CLIP); + } + + // Replace trailing elements. + for (i++; i < n_cigar; i++) { + op = bam_cigar_op(cigar[i]); + oplen = bam_cigar_oplen(cigar[i]); + if (op == BAM_CHARD_CLIP) { + new_cigar[j++] = cigar[i]; + } else { + new_cigar[j-1] = + bam_cigar_gen(bam_cigar_oplen(new_cigar[j-1]) + oplen, + BAM_CSOFT_CLIP); + } + } + + // We now have cigar[0..old_i-1] for existing CIGAR + // and new_cigar[0..j-1] for new CIGAR trailing component. + + if (old_i+j == n_cigar) { + // Fits and no data move needed + memcpy(&cigar[old_i], new_cigar, j*4); + } else { + uint8_t *seq_old = bam_get_seq(b); + uint8_t *aux_end = b->data + b->l_data; + int nshift; + if (old_i+j < n_cigar) { + // Smaller, and can move data down + nshift = -4*(n_cigar - (old_i+j)); + } else { + // Bigger, so grow BAM and move data up + nshift = 4*(old_i+j - n_cigar); + // FIXME: make htslib's sam_realloc_bam_data public + if (b->l_data + nshift > b->m_data) { + uint8_t *new_data = realloc(b->data, b->l_data + nshift); + if (!new_data) { + if (new_cigar != new_cigar_a) + free(new_cigar); + return -1; + } + b->m_data = b->l_data + nshift; + if (b->data != new_data) { + b->data = new_data; + seq_old = bam_get_seq(b); + aux_end = b->data + b->l_data; + cigar = bam_get_cigar(b); + } + } + } + memmove(seq_old+nshift, seq_old, aux_end - seq_old); + b->l_data += nshift; + memcpy(&cigar[old_i], new_cigar, j*4); + b->core.n_cigar = old_i+j; + } + + if (new_cigar != new_cigar_a) + free(new_cigar); + + return 0; +} + +// Parses a comma-separated list of "pos", "mqual", "unmap", "cigar", and "aux" +// keywords for the bam sanitizer. 
+int bam_sanitize_options(const char *str) { + int opt = 0; + + while (str && *str) { + const char *str_start; + while(*str && *str == ',') + str++; + + for (str_start = str; *str && *str != ','; str++); + int len = str - str_start; + if (strncmp(str_start, "all", 3) == 0 || *str_start == '*') + opt = FIX_ALL; + else if (strncmp(str_start, "none", 4) == 0 || + strncmp(str_start, "off", 3) == 0) + opt = 0; + else if (strncmp(str_start, "on", 2) == 0) + // default for position sorted data + opt = FIX_MQUAL | FIX_UNMAP | FIX_CIGAR | FIX_AUX; + else if (strncmp(str_start, "pos", 3) == 0) + opt |= FIX_POS; + else if (strncmp(str_start, "mqual", 5) == 0) + opt |= FIX_MQUAL; + else if (strncmp(str_start, "unmap", 5) == 0) + opt |= FIX_UNMAP; + else if (strncmp(str_start, "cigar", 5) == 0) + opt |= FIX_CIGAR; + else if (strncmp(str_start, "aux", 3) == 0) + opt |= FIX_AUX; + else { + print_error("sanitize", "Unrecognised keyword %.*s\n", + len, str_start); + return -1; + } + } + + return opt; +} + +int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) { + if ((flags & FIX_POS) && b->core.tid < 0) { + // RNAME * => pos 0. NB can break alignment chr/pos sort order + b->core.pos = -1; + if (flags & FIX_UNMAP) + b->core.flag |= BAM_FUNMAP; + } + + if ((flags & FIX_CIGAR) && !(b->core.flag & BAM_FUNMAP)) { + // Mapped => unmapped correction + if (b->core.pos < 0 && (flags & FIX_UNMAP)) { + b->core.flag |= BAM_FUNMAP; + } else { + hts_pos_t cur_end, rlen = sam_hdr_tid2len(h, b->core.tid); + if (b->core.pos >= rlen && (flags & FIX_UNMAP)) { + b->core.flag |= BAM_FUNMAP; + if (flags & FIX_POS) + b->core.tid = b->core.pos = -1; + } else if ((cur_end = bam_endpos(b)) > rlen) { + if (bam_trim(b, rlen) < 0) + return -1; + } + } + } + + if (b->core.flag & BAM_FUNMAP) { + // Unmapped -> cigar/qual correctoins + if ((flags & FIX_CIGAR) && b->core.n_cigar > 0) + clear_cigar(b); + + if (flags & FIX_MQUAL) + b->core.qual = 0; + + // Remove NM, MD, CG, SM tags. 
+ if (flags & FIX_AUX) { + uint8_t *from = bam_aux_first(b); + uint8_t *end = b->data + b->l_data; + uint8_t *to = from ? from-2 : end; + +#define XTAG(a) (((a)[0]<<8) + (a)[1]) + while (from) { + uint8_t *next = bam_aux_next(b, from); + if (!next && errno != ENOENT) + return -1; + + // Keep tag unless one of a specific set. + // NB "to" always points to an aux tag start, while + // "from" is after key. + from -= 2; + int key = (int)from[0]<<8 | from[1]; + if (key != XTAG("NM") && key != XTAG("MD") && + key != XTAG("CG") && key != XTAG("SM")) { + ptrdiff_t len = (next ? next-2 : end) - from; + if (from != to) + memmove(to, from, len); + to += len; + } + from = next; + } + b->l_data = to - b->data; + } + } + + return 0; +} + // currently, this function ONLY works if each read has one hit -static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg) +static int bam_mating_core(samFile *in, samFile *out, int remove_reads, + int proper_pair_check, int add_ct, + int do_mate_scoring, char *arg_list, int no_pg, + int sanitize_flags) { sam_hdr_t *header; bam1_t *b[2] = { NULL, NULL }; @@ -289,6 +511,8 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop curr = 0; has_prev = 0; while ((result = sam_read1(in, header, b[curr])) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; + if (bam_sanitize(header, cur, sanitize_flags) < 0) + goto fail; if (cur->core.flag & BAM_FSECONDARY) { if ( !remove_reads ) { @@ -301,17 +525,11 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop if (sam_write1(out, header, cur) < 0) goto write_fail; continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } - if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag - { - cur->core.flag |= BAM_FUNMAP; - } if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end { cur_end = 
bam_endpos(cur); - - // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag - if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP; } + if (has_prev) { // do we have a pair of reads to examine? if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; @@ -357,11 +575,6 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop } has_prev = 0; } else { // unpaired? clear bad info and write it out - if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped - pre->core.flag |= BAM_FUNMAP; - pre->core.tid = -1; - pre->core.pos = -1; - } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) { @@ -415,6 +628,8 @@ void usage(FILE* where) " -c Add template cigar ct tag\n" " -m Add mate score tag\n" " -u Uncompressed output\n" +" -z, --sanitize FLAG[,FLAG]\n" +" Sanitize alignment fields [defaults to all types]\n" " --no-PG do not add a PG line\n"); sam_global_opt_help(where, "-.O..@-."); @@ -430,7 +645,8 @@ int bam_mating(int argc, char *argv[]) { htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL; - int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; + int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, + mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[4] = {'w', 'b', 0, 0}; static const struct option lopts[] = { @@ -442,17 +658,21 @@ int bam_mating(int argc, char *argv[]) // parse args if (argc == 1) { usage(stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcmO:@:uz:", lopts, NULL)) >= 0) { switch (c) { - case 'r': remove_reads = 1; break; - case 'p': 
proper_pair_check = 0; break; - case 'c': add_ct = 1; break; - case 'm': mate_score = 1; break; - case 'u': wmode[2] = '0'; break; - case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': usage(stderr); goto fail; + case 'r': remove_reads = 1; break; + case 'p': proper_pair_check = 0; break; + case 'c': add_ct = 1; break; + case 'm': mate_score = 1; break; + case 'u': wmode[2] = '0'; break; + case 1: no_pg = 1; break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage(stderr); goto fail; + case 'z': + if ((sanitize_flags = bam_sanitize_options(optarg)) < 0) + exit(1); + break; } } if (optind+1 >= argc) { usage(stderr); goto fail; } @@ -481,7 +701,8 @@ int bam_mating(int argc, char *argv[]) } // run - res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg); + res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, + mate_score, arg_list, no_pg, sanitize_flags); // cleanup sam_close(in); diff --git a/samtools/bam_mate.c.pysam.c b/samtools/bam_mate.c.pysam.c index 0aa83ec..1796f6e 100644 --- a/samtools/bam_mate.c.pysam.c +++ b/samtools/bam_mate.c.pysam.c @@ -2,7 +2,7 @@ /* bam_mate.c -- fix mate pairing information and clean up flags. - Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd. + Copyright (C) 2009, 2011-2017, 2019, 2022 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Portions copyright (C) 2012 Peter Cock, The James Hutton Institute. @@ -255,8 +255,230 @@ static int add_mate_score(bam1_t *src, bam1_t *dest) return 0; } +// Completely delete the CIGAR field +static void clear_cigar(bam1_t *b) { + memmove(bam_get_cigar(b), bam_get_seq(b), + b->data + b->l_data - bam_get_seq(b)); + b->l_data -= 4*b->core.n_cigar; + b->core.n_cigar = 0; +} + +// Trim a CIGAR field to end on reference position "end". 
Remaining bases +// are turned to soft clips. +static int bam_trim(bam1_t *b, hts_pos_t end) { + hts_pos_t pos = b->core.pos; + int n_cigar = b->core.n_cigar, i; + uint32_t new_cigar_a[1024]; + uint32_t *new_cigar = new_cigar_a; + uint32_t *cigar = bam_get_cigar(b); + + // Find end of alignment or end of ref + int op = 0, oplen = 0; + for (i = 0; i < n_cigar; i++) { + op = bam_cigar_op(cigar[i]); + oplen = bam_cigar_oplen(cigar[i]); + if (!(bam_cigar_type(op) & 2)) + continue; + pos += oplen; + if (pos > end) + break; + } + + if (i == n_cigar) + // looks fine already + return 0; + + int old_i = i, j = 0; + // At worst we grow by 1 element (eg 100M -> 70M30S) + if (n_cigar-i >= 1024-1) { + new_cigar = malloc(4*(n_cigar-i+1)); + if (!new_cigar) + return -1; + } + + // We fill out to new_cigar from here on. + if (pos-oplen < end) { + // Partial CIGAR op? Split existing tag. + cigar[old_i++] = bam_cigar_gen(end - (pos-oplen), op); + new_cigar[j++] = bam_cigar_gen(pos-end, BAM_CSOFT_CLIP); + } else if (pos > end) { + // entirely off the chromosome; this will trigger CIGAR *, MQUAL 0 + b->core.flag |= BAM_FUNMAP; + b->core.flag &= ~BAM_FPROPER_PAIR; + } else { + // CIGAR op started on the trim junction + new_cigar[j++] = bam_cigar_gen(oplen, BAM_CSOFT_CLIP); + } + + // Replace trailing elements. + for (i++; i < n_cigar; i++) { + op = bam_cigar_op(cigar[i]); + oplen = bam_cigar_oplen(cigar[i]); + if (op == BAM_CHARD_CLIP) { + new_cigar[j++] = cigar[i]; + } else { + new_cigar[j-1] = + bam_cigar_gen(bam_cigar_oplen(new_cigar[j-1]) + oplen, + BAM_CSOFT_CLIP); + } + } + + // We now have cigar[0..old_i-1] for existing CIGAR + // and new_cigar[0..j-1] for new CIGAR trailing component. 
+ + if (old_i+j == n_cigar) { + // Fits and no data move needed + memcpy(&cigar[old_i], new_cigar, j*4); + } else { + uint8_t *seq_old = bam_get_seq(b); + uint8_t *aux_end = b->data + b->l_data; + int nshift; + if (old_i+j < n_cigar) { + // Smaller, and can move data down + nshift = -4*(n_cigar - (old_i+j)); + } else { + // Bigger, so grow BAM and move data up + nshift = 4*(old_i+j - n_cigar); + // FIXME: make htslib's sam_realloc_bam_data public + if (b->l_data + nshift > b->m_data) { + uint8_t *new_data = realloc(b->data, b->l_data + nshift); + if (!new_data) { + if (new_cigar != new_cigar_a) + free(new_cigar); + return -1; + } + b->m_data = b->l_data + nshift; + if (b->data != new_data) { + b->data = new_data; + seq_old = bam_get_seq(b); + aux_end = b->data + b->l_data; + cigar = bam_get_cigar(b); + } + } + } + memmove(seq_old+nshift, seq_old, aux_end - seq_old); + b->l_data += nshift; + memcpy(&cigar[old_i], new_cigar, j*4); + b->core.n_cigar = old_i+j; + } + + if (new_cigar != new_cigar_a) + free(new_cigar); + + return 0; +} + +// Parses a comma-separated list of "pos", "mqual", "unmap", "cigar", and "aux" +// keywords for the bam sanitizer. 
+int bam_sanitize_options(const char *str) { + int opt = 0; + + while (str && *str) { + const char *str_start; + while(*str && *str == ',') + str++; + + for (str_start = str; *str && *str != ','; str++); + int len = str - str_start; + if (strncmp(str_start, "all", 3) == 0 || *str_start == '*') + opt = FIX_ALL; + else if (strncmp(str_start, "none", 4) == 0 || + strncmp(str_start, "off", 3) == 0) + opt = 0; + else if (strncmp(str_start, "on", 2) == 0) + // default for position sorted data + opt = FIX_MQUAL | FIX_UNMAP | FIX_CIGAR | FIX_AUX; + else if (strncmp(str_start, "pos", 3) == 0) + opt |= FIX_POS; + else if (strncmp(str_start, "mqual", 5) == 0) + opt |= FIX_MQUAL; + else if (strncmp(str_start, "unmap", 5) == 0) + opt |= FIX_UNMAP; + else if (strncmp(str_start, "cigar", 5) == 0) + opt |= FIX_CIGAR; + else if (strncmp(str_start, "aux", 3) == 0) + opt |= FIX_AUX; + else { + print_error("sanitize", "Unrecognised keyword %.*s\n", + len, str_start); + return -1; + } + } + + return opt; +} + +int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) { + if ((flags & FIX_POS) && b->core.tid < 0) { + // RNAME * => pos 0. NB can break alignment chr/pos sort order + b->core.pos = -1; + if (flags & FIX_UNMAP) + b->core.flag |= BAM_FUNMAP; + } + + if ((flags & FIX_CIGAR) && !(b->core.flag & BAM_FUNMAP)) { + // Mapped => unmapped correction + if (b->core.pos < 0 && (flags & FIX_UNMAP)) { + b->core.flag |= BAM_FUNMAP; + } else { + hts_pos_t cur_end, rlen = sam_hdr_tid2len(h, b->core.tid); + if (b->core.pos >= rlen && (flags & FIX_UNMAP)) { + b->core.flag |= BAM_FUNMAP; + if (flags & FIX_POS) + b->core.tid = b->core.pos = -1; + } else if ((cur_end = bam_endpos(b)) > rlen) { + if (bam_trim(b, rlen) < 0) + return -1; + } + } + } + + if (b->core.flag & BAM_FUNMAP) { + // Unmapped -> cigar/qual correctoins + if ((flags & FIX_CIGAR) && b->core.n_cigar > 0) + clear_cigar(b); + + if (flags & FIX_MQUAL) + b->core.qual = 0; + + // Remove NM, MD, CG, SM tags. 
+ if (flags & FIX_AUX) { + uint8_t *from = bam_aux_first(b); + uint8_t *end = b->data + b->l_data; + uint8_t *to = from ? from-2 : end; + +#define XTAG(a) (((a)[0]<<8) + (a)[1]) + while (from) { + uint8_t *next = bam_aux_next(b, from); + if (!next && errno != ENOENT) + return -1; + + // Keep tag unless one of a specific set. + // NB "to" always points to an aux tag start, while + // "from" is after key. + from -= 2; + int key = (int)from[0]<<8 | from[1]; + if (key != XTAG("NM") && key != XTAG("MD") && + key != XTAG("CG") && key != XTAG("SM")) { + ptrdiff_t len = (next ? next-2 : end) - from; + if (from != to) + memmove(to, from, len); + to += len; + } + from = next; + } + b->l_data = to - b->data; + } + } + + return 0; +} + // currently, this function ONLY works if each read has one hit -static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg) +static int bam_mating_core(samFile *in, samFile *out, int remove_reads, + int proper_pair_check, int add_ct, + int do_mate_scoring, char *arg_list, int no_pg, + int sanitize_flags) { sam_hdr_t *header; bam1_t *b[2] = { NULL, NULL }; @@ -291,6 +513,8 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop curr = 0; has_prev = 0; while ((result = sam_read1(in, header, b[curr])) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; + if (bam_sanitize(header, cur, sanitize_flags) < 0) + goto fail; if (cur->core.flag & BAM_FSECONDARY) { if ( !remove_reads ) { @@ -303,17 +527,11 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop if (sam_write1(out, header, cur) < 0) goto write_fail; continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } - if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag - { - cur->core.flag |= BAM_FUNMAP; - } if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end { cur_end = 
bam_endpos(cur); - - // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag - if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP; } + if (has_prev) { // do we have a pair of reads to examine? if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; @@ -359,11 +577,6 @@ static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int prop } has_prev = 0; } else { // unpaired? clear bad info and write it out - if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped - pre->core.flag |= BAM_FUNMAP; - pre->core.tid = -1; - pre->core.pos = -1; - } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) { @@ -417,6 +630,8 @@ void usage(FILE* where) " -c Add template cigar ct tag\n" " -m Add mate score tag\n" " -u Uncompressed output\n" +" -z, --sanitize FLAG[,FLAG]\n" +" Sanitize alignment fields [defaults to all types]\n" " --no-PG do not add a PG line\n"); sam_global_opt_help(where, "-.O..@-."); @@ -432,7 +647,8 @@ int bam_mating(int argc, char *argv[]) { htsThreadPool p = {NULL, 0}; samFile *in = NULL, *out = NULL; - int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0; + int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, + mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; char wmode[4] = {'w', 'b', 0, 0}; static const struct option lopts[] = { @@ -444,17 +660,21 @@ int bam_mating(int argc, char *argv[]) // parse args if (argc == 1) { usage(samtools_stdout); return 0; } - while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "rpcmO:@:uz:", lopts, NULL)) >= 0) { switch (c) { - case 'r': remove_reads = 1; break; - case 'p': 
proper_pair_check = 0; break; - case 'c': add_ct = 1; break; - case 'm': mate_score = 1; break; - case 'u': wmode[2] = '0'; break; - case 1: no_pg = 1; break; - default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; - /* else fall-through */ - case '?': usage(samtools_stderr); goto fail; + case 'r': remove_reads = 1; break; + case 'p': proper_pair_check = 0; break; + case 'c': add_ct = 1; break; + case 'm': mate_score = 1; break; + case 'u': wmode[2] = '0'; break; + case 1: no_pg = 1; break; + default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage(samtools_stderr); goto fail; + case 'z': + if ((sanitize_flags = bam_sanitize_options(optarg)) < 0) + samtools_exit(1); + break; } } if (optind+1 >= argc) { usage(samtools_stderr); goto fail; } @@ -483,7 +703,8 @@ int bam_mating(int argc, char *argv[]) } // run - res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg); + res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, + mate_score, arg_list, no_pg, sanitize_flags); // cleanup sam_close(in); diff --git a/samtools/bam_md.c b/samtools/bam_md.c index 7d5aeaa..d7fd60f 100644 --- a/samtools/bam_md.c +++ b/samtools/bam_md.c @@ -1,6 +1,6 @@ /* bam_md.c -- calmd subcommand. - Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2015, 2019-2020, 2022 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/faidx.h" #include "htslib/sam.h" #include "htslib/kstring.h" @@ -45,6 +46,19 @@ DEALINGS IN THE SOFTWARE. 
*/ #define UPDATE_MD 16 #define HASH_QNM 32 +typedef struct cached_ref_entry { + char *ref; + hts_pos_t len; +} cached_ref_entry; + +typedef struct ref_cache { + cached_ref_entry *refs; + char *last_ref; + hts_pos_t last_len; + int nref; + int last_tid; +} ref_cache; + int bam_aux_drop_other(bam1_t *b, uint8_t *s); static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref, @@ -214,6 +228,106 @@ int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode) return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL); } +// Get a new reference sequence. +// For position-sorted inputs, the previous reference should never be +// needed again and can be discarded to save memory. For other orderings, +// references are stored in a cache in case they're required in the future. +// The caching mode is turned on if the requested tid is less than the last +// one used, indicating the file ordering doesn't match the sequence dictionary. +static int get_ref(faidx_t *fai, sam_hdr_t *header, ref_cache *cache, + int tid, char **ref_out, const char **ref_name_out, + hts_pos_t *len_out) +{ + char *ref = NULL; + const char *ref_name; + hts_pos_t len = 0; + + // This should only be called when tid changes + assert(tid != cache->last_tid); + + // Array lookup, should be fast + ref_name = sam_hdr_tid2name(header, tid); + *ref_name_out = ref_name; + + // Return a cached entry, if available + if (cache->refs && tid >= 0 && tid < cache->nref + && cache->refs[tid].ref) { + assert(cache->last_ref == NULL); + *ref_out = cache->refs[tid].ref; + *len_out = cache->refs[tid].len; + cache->last_tid = tid; + return 0; + } + + // Try to get the reference + if (ref_name) + ref = fai_fetch64(fai, ref_name, &len); + + if (!ref) { + // Historically, calmd doesn't worry too much about missing refs + *ref_out = NULL; + *len_out = 0; + return 0; + } + + if (!cache->refs && cache->last_tid > tid) { + // Going backwards throught the list of tids implies + // a 
non-position-ordered file, so turn on caching mode + cache->nref = sam_hdr_nref(header); + if (cache->nref < 0) { + print_error("calmd", "couldn't get number of refs from header"); + return -1; + } + if (cache->nref > 0) { + cache->refs = calloc(cache->nref, sizeof(cache->refs[0])); + if (!cache->refs) { + print_error_errno("calmd", + "couldn't allocate reference cache"); + return -1; + } + // Add the reference we already have as the first entry + if (cache->last_tid >= 0 && cache->last_tid < cache->nref) { + cache->refs[cache->last_tid].ref = cache->last_ref; + cache->refs[cache->last_tid].len = cache->last_len; + } else { + free(cache->last_ref); + } + cache->last_ref = NULL; + } + } + + if (cache->refs) { + assert(cache->last_ref == NULL); // Shouldn't be set when caching + // Add the new reference to the cache + if (tid >= 0 && tid < cache->nref) { + cache->refs[tid].ref = ref; + cache->refs[tid].len = len; + } + } else { + // Streaming mode - free the last ref and replace it with this one + free(cache->last_ref); + cache->last_ref = ref; + cache->last_len = len; + } + + *ref_out = ref; + *len_out = len; + cache->last_tid = tid; + return 0; +} + +static void refs_destroy(ref_cache *cache) { + if (cache->refs) { + int i; + assert(cache->last_ref == NULL); + for (i = 0; i < cache->nref; i++) + free(cache->refs[i].ref); + free(cache->refs); + } else { + free(cache->last_ref); + } +} + int calmd_usage() { fprintf(stderr, "Usage: samtools calmd [-eubrAESQ] \n" @@ -234,13 +348,14 @@ int calmd_usage() { int bam_fillmd(int argc, char *argv[]) { - int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; - hts_pos_t len; + int c, flt_flag, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; + hts_pos_t len = 0; htsThreadPool p = {NULL, 0}; samFile *fp = NULL, *fpout = NULL; sam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file, *arg_list = 
NULL; + ref_cache refs = { NULL, NULL, 0, 0, -2 }; const char *ref_name = NULL; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -342,15 +457,11 @@ int bam_fillmd(int argc, char *argv[]) } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { - if (tid != b->core.tid) { - free(ref); - ref = NULL; - len = 0; - ref_name = sam_hdr_tid2name(header, b->core.tid); - if (ref_name) { - ref = fai_fetch64(fai, ref_name, &len); + if (refs.last_tid != b->core.tid) { + if (get_ref(fai, header, &refs, b->core.tid, + &ref, &ref_name, &len) < 0) { + goto fail; } - tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", ref_name ? ref_name : "(unknown)"); @@ -393,7 +504,7 @@ int bam_fillmd(int argc, char *argv[]) sam_hdr_destroy(header); free(arg_list); - free(ref); + refs_destroy(&refs); fai_destroy(fai); sam_close(fp); if (sam_close(fpout) < 0) { @@ -406,7 +517,7 @@ int bam_fillmd(int argc, char *argv[]) fail: free(arg_list); - free(ref); + refs_destroy(&refs); if (b) bam_destroy1(b); if (header) sam_hdr_destroy(header); if (fai) fai_destroy(fai); diff --git a/samtools/bam_md.c.pysam.c b/samtools/bam_md.c.pysam.c index b71e77c..0daf177 100644 --- a/samtools/bam_md.c.pysam.c +++ b/samtools/bam_md.c.pysam.c @@ -2,7 +2,7 @@ /* bam_md.c -- calmd subcommand. - Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2015, 2019-2020, 2022 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include "htslib/faidx.h" #include "htslib/sam.h" #include "htslib/kstring.h" @@ -47,6 +48,19 @@ DEALINGS IN THE SOFTWARE. 
*/ #define UPDATE_MD 16 #define HASH_QNM 32 +typedef struct cached_ref_entry { + char *ref; + hts_pos_t len; +} cached_ref_entry; + +typedef struct ref_cache { + cached_ref_entry *refs; + char *last_ref; + hts_pos_t last_len; + int nref; + int last_tid; +} ref_cache; + int bam_aux_drop_other(bam1_t *b, uint8_t *s); static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref, @@ -216,6 +230,106 @@ int bam_fillmd1(bam1_t *b, char *ref, int flag, int quiet_mode) return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL); } +// Get a new reference sequence. +// For position-sorted inputs, the previous reference should never be +// needed again and can be discarded to save memory. For other orderings, +// references are stored in a cache in case they're required in the future. +// The caching mode is turned on if the requested tid is less than the last +// one used, indicating the file ordering doesn't match the sequence dictionary. +static int get_ref(faidx_t *fai, sam_hdr_t *header, ref_cache *cache, + int tid, char **ref_out, const char **ref_name_out, + hts_pos_t *len_out) +{ + char *ref = NULL; + const char *ref_name; + hts_pos_t len = 0; + + // This should only be called when tid changes + assert(tid != cache->last_tid); + + // Array lookup, should be fast + ref_name = sam_hdr_tid2name(header, tid); + *ref_name_out = ref_name; + + // Return a cached entry, if available + if (cache->refs && tid >= 0 && tid < cache->nref + && cache->refs[tid].ref) { + assert(cache->last_ref == NULL); + *ref_out = cache->refs[tid].ref; + *len_out = cache->refs[tid].len; + cache->last_tid = tid; + return 0; + } + + // Try to get the reference + if (ref_name) + ref = fai_fetch64(fai, ref_name, &len); + + if (!ref) { + // Historically, calmd doesn't worry too much about missing refs + *ref_out = NULL; + *len_out = 0; + return 0; + } + + if (!cache->refs && cache->last_tid > tid) { + // Going backwards throught the list of tids implies + // a 
non-position-ordered file, so turn on caching mode + cache->nref = sam_hdr_nref(header); + if (cache->nref < 0) { + print_error("calmd", "couldn't get number of refs from header"); + return -1; + } + if (cache->nref > 0) { + cache->refs = calloc(cache->nref, sizeof(cache->refs[0])); + if (!cache->refs) { + print_error_errno("calmd", + "couldn't allocate reference cache"); + return -1; + } + // Add the reference we already have as the first entry + if (cache->last_tid >= 0 && cache->last_tid < cache->nref) { + cache->refs[cache->last_tid].ref = cache->last_ref; + cache->refs[cache->last_tid].len = cache->last_len; + } else { + free(cache->last_ref); + } + cache->last_ref = NULL; + } + } + + if (cache->refs) { + assert(cache->last_ref == NULL); // Shouldn't be set when caching + // Add the new reference to the cache + if (tid >= 0 && tid < cache->nref) { + cache->refs[tid].ref = ref; + cache->refs[tid].len = len; + } + } else { + // Streaming mode - free the last ref and replace it with this one + free(cache->last_ref); + cache->last_ref = ref; + cache->last_len = len; + } + + *ref_out = ref; + *len_out = len; + cache->last_tid = tid; + return 0; +} + +static void refs_destroy(ref_cache *cache) { + if (cache->refs) { + int i; + assert(cache->last_ref == NULL); + for (i = 0; i < cache->nref; i++) + free(cache->refs[i].ref); + free(cache->refs); + } else { + free(cache->last_ref); + } +} + int calmd_usage() { fprintf(samtools_stderr, "Usage: samtools calmd [-eubrAESQ] \n" @@ -236,13 +350,14 @@ int calmd_usage() { int bam_fillmd(int argc, char *argv[]) { - int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; - hts_pos_t len; + int c, flt_flag, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0; + hts_pos_t len = 0; htsThreadPool p = {NULL, 0}; samFile *fp = NULL, *fpout = NULL; sam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file, 
*arg_list = NULL; + ref_cache refs = { NULL, NULL, 0, 0, -2 }; const char *ref_name = NULL; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; @@ -344,15 +459,11 @@ int bam_fillmd(int argc, char *argv[]) } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { - if (tid != b->core.tid) { - free(ref); - ref = NULL; - len = 0; - ref_name = sam_hdr_tid2name(header, b->core.tid); - if (ref_name) { - ref = fai_fetch64(fai, ref_name, &len); + if (refs.last_tid != b->core.tid) { + if (get_ref(fai, header, &refs, b->core.tid, + &ref, &ref_name, &len) < 0) { + goto fail; } - tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", ref_name ? ref_name : "(unknown)"); @@ -395,7 +506,7 @@ int bam_fillmd(int argc, char *argv[]) sam_hdr_destroy(header); free(arg_list); - free(ref); + refs_destroy(&refs); fai_destroy(fai); sam_close(fp); if (sam_close(fpout) < 0) { @@ -408,7 +519,7 @@ int bam_fillmd(int argc, char *argv[]) fail: free(arg_list); - free(ref); + refs_destroy(&refs); if (b) bam_destroy1(b); if (header) sam_hdr_destroy(header); if (fai) fai_destroy(fai); diff --git a/samtools/bam_plcmd.c b/samtools/bam_plcmd.c index c73bf89..264a7f5 100644 --- a/samtools/bam_plcmd.c +++ b/samtools/bam_plcmd.c @@ -941,9 +941,12 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) fprintf(fp, " -r, --region REG region in which pileup is generated\n" " -R, --ignore-RG ignore RG tags (one BAM = one sample)\n" -" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require); +" --rf, --incl-flags STR|INT\n" +" required flags: only include reads with any of\n" +" the mask bits set [%s]\n", tmp_require); fprintf(fp, -" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n" +" --ff, --excl-flags STR|INT\n" +" filter flags: skip reads with any of the mask bits set\n" " [%s]\n", 
tmp_filter); fprintf(fp, " -x, --ignore-overlaps-removal, --disable-overlap-removal\n" diff --git a/samtools/bam_plcmd.c.pysam.c b/samtools/bam_plcmd.c.pysam.c index 8147e85..009867e 100644 --- a/samtools/bam_plcmd.c.pysam.c +++ b/samtools/bam_plcmd.c.pysam.c @@ -943,9 +943,12 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp) fprintf(fp, " -r, --region REG region in which pileup is generated\n" " -R, --ignore-RG ignore RG tags (one BAM = one sample)\n" -" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require); +" --rf, --incl-flags STR|INT\n" +" required flags: only include reads with any of\n" +" the mask bits set [%s]\n", tmp_require); fprintf(fp, -" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n" +" --ff, --excl-flags STR|INT\n" +" filter flags: skip reads with any of the mask bits set\n" " [%s]\n", tmp_filter); fprintf(fp, " -x, --ignore-overlaps-removal, --disable-overlap-removal\n" diff --git a/samtools/bam_sort.c b/samtools/bam_sort.c index 58ecdfd..875e29c 100644 --- a/samtools/bam_sort.c +++ b/samtools/bam_sort.c @@ -48,11 +48,14 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" #include "htslib/hts_endian.h" #include "htslib/cram.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "samtools.h" #include "bedidx.h" #include "bam.h" +#define BAM_BLOCK_SIZE 2*1024*1024 +#define MAX_TMP_FILES 64 // Struct which contains the sorting key for TemplateCoordinate sort. 
typedef struct { @@ -161,25 +164,36 @@ typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, Templ static SamOrder g_sam_order = Coordinate; static char g_sort_tag[2] = {0,0}; +#define is_digit(c) ((c)<='9' && (c)>='0') static int strnum_cmp(const char *_a, const char *_b) { const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; const unsigned char *pa = a, *pb = b; while (*pa && *pb) { - if (isdigit(*pa) && isdigit(*pb)) { + if (!is_digit(*pa) || !is_digit(*pb)) { + if (*pa != *pb) + return (int)*pa - (int)*pb; + ++pa; ++pb; + } else { + // skip leading zeros while (*pa == '0') ++pa; while (*pb == '0') ++pb; - while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb; - if (isdigit(*pa) && isdigit(*pb)) { - int i = 0; - while (isdigit(pa[i]) && isdigit(pb[i])) ++i; - return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb; - } else if (isdigit(*pa)) return 1; - else if (isdigit(*pb)) return -1; - else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1; - } else { - if (*pa != *pb) return (int)*pa - (int)*pb; - ++pa; ++pb; + + // skip matching digits + while (is_digit(*pa) && *pa == *pb) + pa++, pb++; + + // Now mismatching, so see which ends the number sooner + int diff = (int)*pa - (int)*pb; + while (is_digit(*pa) && is_digit(*pb)) + pa++, pb++; + + if (is_digit(*pa)) + return 1; // pa still going, so larger + else if (is_digit(*pb)) + return -1; // pb still going, so larger + else if (diff) + return diff; // same length, so earlier diff } } return *pa? 1 : *pb? 
-1 : 0; @@ -1165,6 +1179,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c print_error_errno(cmd, "fail to open \"%s\"", fn[i]); goto fail; } + hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); hin = sam_hdr_read(fp[i]); if (hin == NULL) { print_error(cmd, "failed to read header from \"%s\"", fn[i]); @@ -1362,6 +1377,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c print_error_errno(cmd, "failed to create \"%s\"", out); return -1; } + hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); if (!no_pg && sam_hdr_add_pg(hout, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, @@ -1763,7 +1779,8 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, int n, char * const *fn, int num_in_mem, buf_region *in_mem, bam1_tag *buf, template_coordinate_keys_t *keys, - khash_t(const_c2c) *lib_lookup, int n_threads, + khash_t(const_c2c) *lib_lookup, + htsThreadPool *htspool, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -1800,6 +1817,9 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, print_error_errno(cmd, "fail to open \"%s\"", fn[i]); goto fail; } + hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); + if (htspool->pool) + hts_set_opt(fp[i], HTS_OPT_THREAD_POOL, htspool); // Read header ... 
hin = sam_hdr_read(fp[i]); @@ -1832,6 +1852,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, print_error_errno(cmd, "failed to create \"%s\"", out); return -1; } + hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); if (!no_pg && sam_hdr_add_pg(hout, "samtools", "VN", samtools_version(), @@ -1843,7 +1864,8 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, return -1; } - if (n_threads > 1) hts_set_threads(fpout, n_threads); + if (htspool->pool) + hts_set_opt(fpout, HTS_OPT_THREAD_POOL, htspool); if (sam_hdr_write(fpout, hout) != 0) { print_error_errno(cmd, "failed to write header to \"%s\"", out); @@ -2216,13 +2238,9 @@ KSORT_INIT(sort, bam1_tag, bam1_lt) typedef struct { size_t buf_len; - const char *prefix; bam1_tag *buf; const sam_hdr_t *h; - char *tmpfile_name; - int index; int error; - int no_save; int large_pos; int minimiser_kmer; } worker_t; @@ -2239,6 +2257,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu fp = sam_open_format(fn, mode, fmt); if (fp == NULL) return -1; + hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? 
arg_list : NULL, @@ -2555,10 +2574,7 @@ static inline void worker_minhash(worker_t *w) { static void *worker(void *data) { worker_t *w = (worker_t*)data; - char *name; - size_t name_len; w->error = 0; - w->tmpfile_name = NULL; switch (g_sam_order) { case Coordinate: @@ -2574,45 +2590,12 @@ static void *worker(void *data) ks_mergesort(sort, w->buf_len, w->buf, 0); } - if (w->no_save) - return 0; - - name_len = strlen(w->prefix) + 30; - name = (char*)calloc(name_len, 1); - if (!name) { w->error = errno; return 0; } - const int MAX_TRIES = 1000; - int tries = 0; - for (;;) { - if (tries) { - snprintf(name, name_len, "%s.%.4d-%.3d.bam", - w->prefix, w->index, tries); - } else { - snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index); - } - - if (write_buffer(name, w->large_pos ? "wzx1" : "wbx1", - w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) { - break; - } - if (errno == EEXIST && tries < MAX_TRIES) { - tries++; - } else { - w->error = errno; - break; - } - } - - if (w->error) { - free(name); - } else { - w->tmpfile_name = name; - } return 0; } -static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, - const sam_hdr_t *h, int n_threads, buf_region *in_mem, - int large_pos, int minimiser_kmer, char **fns, size_t fns_size) +static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h, + int n_threads, buf_region *in_mem, + int large_pos, int minimiser_kmer) { int i; size_t pos, rest; @@ -2633,49 +2616,26 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, for (i = 0; i < n_threads; ++i) { w[i].buf_len = rest / (n_threads - i); w[i].buf = &buf[pos]; - w[i].prefix = prefix; w[i].h = h; - w[i].index = n_files + i; - w[i].tmpfile_name = NULL; w[i].large_pos = large_pos; w[i].minimiser_kmer = minimiser_kmer; - if (in_mem) { - w[i].no_save = 1; - in_mem[i].from = pos; - in_mem[i].to = pos + w[i].buf_len; - } else { - w[i].no_save = 0; - } + in_mem[i].from = pos; + in_mem[i].to = pos + 
w[i].buf_len; pos += w[i].buf_len; rest -= w[i].buf_len; pthread_create(&tid[i], &attr, worker, &w[i]); } for (i = 0; i < n_threads; ++i) { pthread_join(tid[i], 0); - if (!in_mem) { - assert(w[i].index >= 0 && w[i].index < fns_size); - fns[w[i].index] = w[i].tmpfile_name; - } if (w[i].error != 0) { errno = w[i].error; - print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index); + print_error_errno("sort", "failed to sort block %d", i); n_failed++; } } - if (n_failed && !in_mem) { - // Clean up any temporary files that did get made, as we're - // about to lose track of them - for (i = 0; i < n_threads; ++i) { - if (fns[w[i].index]) { - unlink(fns[w[i].index]); - free(fns[w[i].index]); - fns[w[i].index] = NULL; - } - } - } - free(tid); free(w); - if (n_failed) return -1; - if (in_mem) return n_threads; - return n_files + n_threads; + free(w); + free(tid); + + return n_failed ? -1 : n_threads; } static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) { @@ -2763,7 +2723,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { - int ret = -1, res, i, nref, n_files = 0; + int ret = -1, res, i, nref, n_files = 0, n_big_files = 0, fn_counter = 0; size_t max_k, k, max_mem, bam_mem_offset; sam_hdr_t *header = NULL; samFile *fp = NULL; @@ -2778,6 +2738,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, const char *new_ss = NULL; buf_region *in_mem = NULL; khash_t(const_c2c) *lib_lookup = NULL; + htsThreadPool htspool = { NULL, 0 }; int num_in_mem = 0; int large_pos = 0; @@ -2811,6 +2772,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, print_error_errno("sort", "can't open \"%s\"", fn); goto err; } + hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); header = sam_hdr_read(fp); if (header == NULL) { print_error("sort", "failed to read header 
from \"%s\"", fn); @@ -2919,19 +2881,26 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, } } - // No gain to using the thread pool here as the flow of this code - // is such that we are *either* reading *or* sorting. Hence a shared - // pool makes no real difference except to reduce the thread count a little. - if (n_threads > 1) - hts_set_threads(fp, n_threads); + if (n_threads > 1) { + htspool.pool = hts_tpool_init(n_threads); + if (!htspool.pool) { + print_error_errno("sort", "failed to set up thread pool"); + goto err; + } + hts_set_opt(fp, HTS_OPT_THREAD_POOL, &htspool); + } if ((bam_mem = malloc(max_mem)) == NULL) { print_error("sort", "couldn't allocate memory for bam_mem"); goto err; } + in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0])); + if (!in_mem) goto err; + // write sub files k = max_k = bam_mem_offset = 0; + size_t name_len = strlen(prefix) + 30; while ((res = sam_read1(fp, header, b)) >= 0) { int mem_full = 0; @@ -2985,19 +2954,73 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, ++k; if (mem_full) { - if (hts_resize(char *, n_files + (n_threads > 0 ? n_threads : 1), - &fns_size, &fns, 0) < 0) + if (hts_resize(char *, n_files + 1, &fns_size, &fns, 0) < 0) goto err; - int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads, - NULL, large_pos, minimiser_kmer, fns, fns_size); - if (new_n < 0) { + + int sort_res = sort_blocks(k, buf, header, n_threads, + in_mem, large_pos, minimiser_kmer); + if (sort_res < 0) + goto err; + + fns[n_files] = calloc(name_len, 1); + if (!fns[n_files]) + goto err; + const int MAX_TRIES = 1000; + int tries = 0, merge_res = -1; + char *sort_by_tag = (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) ? 
sort_tag : NULL; + int consolidate_from = n_files; + if (n_files - n_big_files >= MAX_TMP_FILES/2) + consolidate_from = n_big_files; + else if (n_files >= MAX_TMP_FILES) + consolidate_from = 0; + + for (;;) { + if (tries) { + snprintf(fns[n_files], name_len, "%s.%.4d-%.3d.bam", + prefix, fn_counter, tries); + } else { + snprintf(fns[n_files], name_len, "%s.%.4d.bam", prefix, + fn_counter); + } + if (bam_merge_simple(g_sam_order, sort_by_tag, fns[n_files], + large_pos ? "wzx1" : "wbx1", header, + n_files - consolidate_from, + &fns[consolidate_from], n_threads, + in_mem, buf, keys, + lib_lookup, &htspool, "sort", NULL, NULL, + NULL, 1, 0) >= 0) { + merge_res = 0; + break; + } + if (errno == EEXIST && tries < MAX_TRIES) { + tries++; + } else { + break; + } + } + fn_counter++; + if (merge_res < 0) { + if (errno != EEXIST) + unlink(fns[n_files]); + free(fns[n_files]); goto err; - } else { - n_files = new_n; } + + if (consolidate_from < n_files) { + for (i = consolidate_from; i < n_files; i++) { + unlink(fns[i]); + free(fns[i]); + } + fns[consolidate_from] = fns[n_files]; + n_files = consolidate_from; + n_big_files = consolidate_from + 1; + } + + n_files++; k = 0; if (keys != NULL) keys->n = 0; bam_mem_offset = 0; + } } if (res != -1) { @@ -3007,10 +3030,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, // Sort last records if (k > 0) { - in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0])); - if (!in_mem) goto err; - num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, - in_mem, large_pos, minimiser_kmer, fns, fns_size); + num_in_mem = sort_blocks(k, buf, header, n_threads, + in_mem, large_pos, minimiser_kmer); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -3038,7 +3059,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? 
sort_tag : NULL; if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header, n_files, fns, num_in_mem, in_mem, buf, keys, - lib_lookup, n_threads, "sort", in_fmt, out_fmt, + lib_lookup, &htspool, "sort", in_fmt, out_fmt, arg_list, no_pg, write_index) < 0) { // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. @@ -3073,6 +3094,9 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, lib_lookup_destroy(lib_lookup); sam_hdr_destroy(header); if (fp) sam_close(fp); + if (htspool.pool) + hts_tpool_destroy(htspool.pool); + return ret; } diff --git a/samtools/bam_sort.c.pysam.c b/samtools/bam_sort.c.pysam.c index 3489044..4353f61 100644 --- a/samtools/bam_sort.c.pysam.c +++ b/samtools/bam_sort.c.pysam.c @@ -50,11 +50,14 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/sam.h" #include "htslib/hts_endian.h" #include "htslib/cram.h" +#include "htslib/thread_pool.h" #include "sam_opts.h" #include "samtools.h" #include "bedidx.h" #include "bam.h" +#define BAM_BLOCK_SIZE 2*1024*1024 +#define MAX_TMP_FILES 64 // Struct which contains the sorting key for TemplateCoordinate sort. 
typedef struct { @@ -163,25 +166,36 @@ typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, Templ static SamOrder g_sam_order = Coordinate; static char g_sort_tag[2] = {0,0}; +#define is_digit(c) ((c)<='9' && (c)>='0') static int strnum_cmp(const char *_a, const char *_b) { const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; const unsigned char *pa = a, *pb = b; while (*pa && *pb) { - if (isdigit(*pa) && isdigit(*pb)) { + if (!is_digit(*pa) || !is_digit(*pb)) { + if (*pa != *pb) + return (int)*pa - (int)*pb; + ++pa; ++pb; + } else { + // skip leading zeros while (*pa == '0') ++pa; while (*pb == '0') ++pb; - while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb; - if (isdigit(*pa) && isdigit(*pb)) { - int i = 0; - while (isdigit(pa[i]) && isdigit(pb[i])) ++i; - return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb; - } else if (isdigit(*pa)) return 1; - else if (isdigit(*pb)) return -1; - else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1; - } else { - if (*pa != *pb) return (int)*pa - (int)*pb; - ++pa; ++pb; + + // skip matching digits + while (is_digit(*pa) && *pa == *pb) + pa++, pb++; + + // Now mismatching, so see which ends the number sooner + int diff = (int)*pa - (int)*pb; + while (is_digit(*pa) && is_digit(*pb)) + pa++, pb++; + + if (is_digit(*pa)) + return 1; // pa still going, so larger + else if (is_digit(*pb)) + return -1; // pb still going, so larger + else if (diff) + return diff; // same length, so earlier diff } } return *pa? 1 : *pb? 
-1 : 0; @@ -1167,6 +1181,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c print_error_errno(cmd, "fail to open \"%s\"", fn[i]); goto fail; } + hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); hin = sam_hdr_read(fp[i]); if (hin == NULL) { print_error(cmd, "failed to read header from \"%s\"", fn[i]); @@ -1364,6 +1379,7 @@ int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const c print_error_errno(cmd, "failed to create \"%s\"", out); return -1; } + hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); if (!no_pg && sam_hdr_add_pg(hout, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, @@ -1765,7 +1781,8 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, int n, char * const *fn, int num_in_mem, buf_region *in_mem, bam1_tag *buf, template_coordinate_keys_t *keys, - khash_t(const_c2c) *lib_lookup, int n_threads, + khash_t(const_c2c) *lib_lookup, + htsThreadPool *htspool, const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { @@ -1802,6 +1819,9 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, print_error_errno(cmd, "fail to open \"%s\"", fn[i]); goto fail; } + hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); + if (htspool->pool) + hts_set_opt(fp[i], HTS_OPT_THREAD_POOL, htspool); // Read header ... 
hin = sam_hdr_read(fp[i]); @@ -1834,6 +1854,7 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, print_error_errno(cmd, "failed to create \"%s\"", out); return -1; } + hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); if (!no_pg && sam_hdr_add_pg(hout, "samtools", "VN", samtools_version(), @@ -1845,7 +1866,8 @@ static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out, return -1; } - if (n_threads > 1) hts_set_threads(fpout, n_threads); + if (htspool->pool) + hts_set_opt(fpout, HTS_OPT_THREAD_POOL, htspool); if (sam_hdr_write(fpout, hout) != 0) { print_error_errno(cmd, "failed to write header to \"%s\"", out); @@ -2218,13 +2240,9 @@ KSORT_INIT(sort, bam1_tag, bam1_lt) typedef struct { size_t buf_len; - const char *prefix; bam1_tag *buf; const sam_hdr_t *h; - char *tmpfile_name; - int index; int error; - int no_save; int large_pos; int minimiser_kmer; } worker_t; @@ -2241,6 +2259,7 @@ static int write_buffer(const char *fn, const char *mode, size_t l, bam1_tag *bu fp = sam_open_format(fn, mode, fmt); if (fp == NULL) return -1; + hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? 
arg_list : NULL, @@ -2557,10 +2576,7 @@ static inline void worker_minhash(worker_t *w) { static void *worker(void *data) { worker_t *w = (worker_t*)data; - char *name; - size_t name_len; w->error = 0; - w->tmpfile_name = NULL; switch (g_sam_order) { case Coordinate: @@ -2576,45 +2592,12 @@ static void *worker(void *data) ks_mergesort(sort, w->buf_len, w->buf, 0); } - if (w->no_save) - return 0; - - name_len = strlen(w->prefix) + 30; - name = (char*)calloc(name_len, 1); - if (!name) { w->error = errno; return 0; } - const int MAX_TRIES = 1000; - int tries = 0; - for (;;) { - if (tries) { - snprintf(name, name_len, "%s.%.4d-%.3d.bam", - w->prefix, w->index, tries); - } else { - snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index); - } - - if (write_buffer(name, w->large_pos ? "wzx1" : "wbx1", - w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) { - break; - } - if (errno == EEXIST && tries < MAX_TRIES) { - tries++; - } else { - w->error = errno; - break; - } - } - - if (w->error) { - free(name); - } else { - w->tmpfile_name = name; - } return 0; } -static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, - const sam_hdr_t *h, int n_threads, buf_region *in_mem, - int large_pos, int minimiser_kmer, char **fns, size_t fns_size) +static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h, + int n_threads, buf_region *in_mem, + int large_pos, int minimiser_kmer) { int i; size_t pos, rest; @@ -2635,49 +2618,26 @@ static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix, for (i = 0; i < n_threads; ++i) { w[i].buf_len = rest / (n_threads - i); w[i].buf = &buf[pos]; - w[i].prefix = prefix; w[i].h = h; - w[i].index = n_files + i; - w[i].tmpfile_name = NULL; w[i].large_pos = large_pos; w[i].minimiser_kmer = minimiser_kmer; - if (in_mem) { - w[i].no_save = 1; - in_mem[i].from = pos; - in_mem[i].to = pos + w[i].buf_len; - } else { - w[i].no_save = 0; - } + in_mem[i].from = pos; + in_mem[i].to = pos + 
w[i].buf_len; pos += w[i].buf_len; rest -= w[i].buf_len; pthread_create(&tid[i], &attr, worker, &w[i]); } for (i = 0; i < n_threads; ++i) { pthread_join(tid[i], 0); - if (!in_mem) { - assert(w[i].index >= 0 && w[i].index < fns_size); - fns[w[i].index] = w[i].tmpfile_name; - } if (w[i].error != 0) { errno = w[i].error; - print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index); + print_error_errno("sort", "failed to sort block %d", i); n_failed++; } } - if (n_failed && !in_mem) { - // Clean up any temporary files that did get made, as we're - // about to lose track of them - for (i = 0; i < n_threads; ++i) { - if (fns[w[i].index]) { - unlink(fns[w[i].index]); - free(fns[w[i].index]); - fns[w[i].index] = NULL; - } - } - } - free(tid); free(w); - if (n_failed) return -1; - if (in_mem) return n_threads; - return n_files + n_threads; + free(w); + free(tid); + + return n_failed ? -1 : n_threads; } static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) { @@ -2765,7 +2725,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, const htsFormat *in_fmt, const htsFormat *out_fmt, char *arg_list, int no_pg, int write_index) { - int ret = -1, res, i, nref, n_files = 0; + int ret = -1, res, i, nref, n_files = 0, n_big_files = 0, fn_counter = 0; size_t max_k, k, max_mem, bam_mem_offset; sam_hdr_t *header = NULL; samFile *fp = NULL; @@ -2780,6 +2740,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, const char *new_ss = NULL; buf_region *in_mem = NULL; khash_t(const_c2c) *lib_lookup = NULL; + htsThreadPool htspool = { NULL, 0 }; int num_in_mem = 0; int large_pos = 0; @@ -2813,6 +2774,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, print_error_errno("sort", "can't open \"%s\"", fn); goto err; } + hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE); header = sam_hdr_read(fp); if (header == NULL) { print_error("sort", "failed to read header 
from \"%s\"", fn); @@ -2921,19 +2883,26 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, } } - // No gain to using the thread pool here as the flow of this code - // is such that we are *either* reading *or* sorting. Hence a shared - // pool makes no real difference except to reduce the thread count a little. - if (n_threads > 1) - hts_set_threads(fp, n_threads); + if (n_threads > 1) { + htspool.pool = hts_tpool_init(n_threads); + if (!htspool.pool) { + print_error_errno("sort", "failed to set up thread pool"); + goto err; + } + hts_set_opt(fp, HTS_OPT_THREAD_POOL, &htspool); + } if ((bam_mem = malloc(max_mem)) == NULL) { print_error("sort", "couldn't allocate memory for bam_mem"); goto err; } + in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0])); + if (!in_mem) goto err; + // write sub files k = max_k = bam_mem_offset = 0; + size_t name_len = strlen(prefix) + 30; while ((res = sam_read1(fp, header, b)) >= 0) { int mem_full = 0; @@ -2987,19 +2956,73 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, ++k; if (mem_full) { - if (hts_resize(char *, n_files + (n_threads > 0 ? n_threads : 1), - &fns_size, &fns, 0) < 0) + if (hts_resize(char *, n_files + 1, &fns_size, &fns, 0) < 0) goto err; - int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads, - NULL, large_pos, minimiser_kmer, fns, fns_size); - if (new_n < 0) { + + int sort_res = sort_blocks(k, buf, header, n_threads, + in_mem, large_pos, minimiser_kmer); + if (sort_res < 0) + goto err; + + fns[n_files] = calloc(name_len, 1); + if (!fns[n_files]) + goto err; + const int MAX_TRIES = 1000; + int tries = 0, merge_res = -1; + char *sort_by_tag = (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) ? 
sort_tag : NULL; + int consolidate_from = n_files; + if (n_files - n_big_files >= MAX_TMP_FILES/2) + consolidate_from = n_big_files; + else if (n_files >= MAX_TMP_FILES) + consolidate_from = 0; + + for (;;) { + if (tries) { + snprintf(fns[n_files], name_len, "%s.%.4d-%.3d.bam", + prefix, fn_counter, tries); + } else { + snprintf(fns[n_files], name_len, "%s.%.4d.bam", prefix, + fn_counter); + } + if (bam_merge_simple(g_sam_order, sort_by_tag, fns[n_files], + large_pos ? "wzx1" : "wbx1", header, + n_files - consolidate_from, + &fns[consolidate_from], n_threads, + in_mem, buf, keys, + lib_lookup, &htspool, "sort", NULL, NULL, + NULL, 1, 0) >= 0) { + merge_res = 0; + break; + } + if (errno == EEXIST && tries < MAX_TRIES) { + tries++; + } else { + break; + } + } + fn_counter++; + if (merge_res < 0) { + if (errno != EEXIST) + unlink(fns[n_files]); + free(fns[n_files]); goto err; - } else { - n_files = new_n; } + + if (consolidate_from < n_files) { + for (i = consolidate_from; i < n_files; i++) { + unlink(fns[i]); + free(fns[i]); + } + fns[consolidate_from] = fns[n_files]; + n_files = consolidate_from; + n_big_files = consolidate_from + 1; + } + + n_files++; k = 0; if (keys != NULL) keys->n = 0; bam_mem_offset = 0; + } } if (res != -1) { @@ -3009,10 +3032,8 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, // Sort last records if (k > 0) { - in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0])); - if (!in_mem) goto err; - num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads, - in_mem, large_pos, minimiser_kmer, fns, fns_size); + num_in_mem = sort_blocks(k, buf, header, n_threads, + in_mem, large_pos, minimiser_kmer); if (num_in_mem < 0) goto err; } else { num_in_mem = 0; @@ -3040,7 +3061,7 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? 
sort_tag : NULL; if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header, n_files, fns, num_in_mem, in_mem, buf, keys, - lib_lookup, n_threads, "sort", in_fmt, out_fmt, + lib_lookup, &htspool, "sort", in_fmt, out_fmt, arg_list, no_pg, write_index) < 0) { // Propagate bam_merge_simple() failure; it has already emitted a // message explaining the failure, so no further message is needed. @@ -3075,6 +3096,9 @@ int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer, lib_lookup_destroy(lib_lookup); sam_hdr_destroy(header); if (fp) sam_close(fp); + if (htspool.pool) + hts_tpool_destroy(htspool.pool); + return ret; } diff --git a/samtools/bamshuf.c b/samtools/bamshuf.c index 53e382d..05442bf 100644 --- a/samtools/bamshuf.c +++ b/samtools/bamshuf.c @@ -1,7 +1,7 @@ /* bamshuf.c -- collate subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013, 2015-2019 Genome Research Ltd. + Copyright (C) 2013, 2015-2019,2023 Genome Research Ltd. Author: Heng Li @@ -201,7 +201,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, // Read input, distribute reads pseudo-randomly into n_files temporary // files. - fp = sam_open_format(fn, "r", &ga->in); + fp = sam_open_format(fn ? fn : "-", "r", &ga->in); if (fp == NULL) { print_error_errno("collate", "Cannot open input file \"%s\"", fn); return 1; @@ -527,15 +527,17 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, static int usage(FILE *fp, int n_files, int reads_store) { fprintf(fp, - "Usage: samtools collate [-Ou] [-o ] [-n nFiles] [-l cLevel] []\n\n" + "Usage: samtools collate [options...] 
[]\n\n" "Options:\n" - " -O output to stdout\n" - " -o output file name (use prefix if not set)\n" - " -u uncompressed BAM output\n" - " -f fast (only primary alignments)\n" - " -r working reads stored (with -f) [%d]\n" // reads_store - " -l INT compression level [%d]\n" // DEF_CLEVEL - " -n INT number of temporary files [%d]\n" // n_files + " -O Output to stdout\n" + " -o Output file name (use prefix if not set)\n" + " -u Uncompressed BAM output\n" + " -f Fast (only primary alignments)\n" + " -r Working reads stored (with -f) [%d]\n" // reads_store + " -l INT Compression level [%d]\n" // DEF_CLEVEL + " -n INT Number of temporary files [%d]\n" // n_files + " -T PREFIX\n" + " Write tempory files to PREFIX.nnnn.bam\n" " --no-PG do not add a PG line\n", reads_store, DEF_CLEVEL, n_files); @@ -546,9 +548,21 @@ static int usage(FILE *fp, int n_files, int reads_store) { return 1; } -char * generate_prefix() { +char *generate_prefix(const char *out_fn) { char *prefix; unsigned int pid = getpid(); + + if (out_fn && !(*out_fn == '-' && out_fn[1] == '\0')) { + // ... 
+ size_t plen = strlen(out_fn) + 50; + if (!(prefix = malloc(plen))) { + perror("collate"); + return NULL; + } + snprintf(prefix, plen, "%s.collate%x", out_fn, pid); + return prefix; + } + #ifdef _WIN32 # define PREFIX_LEN (MAX_PATH + 16) DWORD ret; @@ -567,13 +581,18 @@ char * generate_prefix() { snprintf(prefix + ret, PREFIX_LEN - ret, "\\%x", pid); return prefix; #else -# define PREFIX_LEN 64 - prefix = malloc(PREFIX_LEN); + char *tmp_env = getenv("TMPDIR"); + if (!tmp_env) + tmp_env = "/tmp"; + + size_t prefix_len = strlen(tmp_env)+20; + prefix = malloc(prefix_len); if (!prefix) { perror("collate"); return NULL; } - snprintf(prefix, PREFIX_LEN, "/tmp/collate%x", pid); + snprintf(prefix, prefix_len, "%s/collate%x", tmp_env, pid); + return prefix; #endif } @@ -590,7 +609,7 @@ int main_bamshuf(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:T:", lopts, NULL)) >= 0) { switch (c) { case 'n': n_files = atoi(optarg); break; case 'l': clevel = atoi(optarg); break; @@ -599,6 +618,7 @@ int main_bamshuf(int argc, char *argv[]) case 'o': output_file = optarg; break; case 'f': fast_coll = 1; break; case 'r': reads_store = atoi(optarg); break; + case 'T': prefix = optarg; break; case 1: no_pg = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ @@ -607,6 +627,12 @@ int main_bamshuf(int argc, char *argv[]) } if (is_un) clevel = 0; if (argc >= optind + 2) prefix = argv[optind+1]; + if (argc == optind) { + if (argc > 1 || !isatty(STDIN_FILENO)) + fprintf(stderr, "collate: no input filename specified.\n"); + return usage(argc > 1 || !isatty(STDIN_FILENO) ? 
stderr : stdout, + n_files, reads_store); + } if (!(prefix || is_stdout || output_file)) return usage(stderr, n_files, reads_store); if (is_stdout && output_file) { @@ -614,7 +640,7 @@ int main_bamshuf(int argc, char *argv[]) return usage(stderr, n_files, reads_store); } if (!prefix) { - prefix = generate_prefix(); + prefix = generate_prefix(output_file); pre_mem = 1; } diff --git a/samtools/bamshuf.c.pysam.c b/samtools/bamshuf.c.pysam.c index d075a93..6547b3c 100644 --- a/samtools/bamshuf.c.pysam.c +++ b/samtools/bamshuf.c.pysam.c @@ -3,7 +3,7 @@ /* bamshuf.c -- collate subcommand. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013, 2015-2019 Genome Research Ltd. + Copyright (C) 2013, 2015-2019,2023 Genome Research Ltd. Author: Heng Li @@ -203,7 +203,7 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, // Read input, distribute reads pseudo-randomly into n_files temporary // files. - fp = sam_open_format(fn, "r", &ga->in); + fp = sam_open_format(fn ? fn : "-", "r", &ga->in); if (fp == NULL) { print_error_errno("collate", "Cannot open input file \"%s\"", fn); return 1; @@ -529,15 +529,17 @@ static int bamshuf(const char *fn, int n_files, const char *pre, int clevel, static int usage(FILE *fp, int n_files, int reads_store) { fprintf(fp, - "Usage: samtools collate [-Ou] [-o ] [-n nFiles] [-l cLevel] []\n\n" + "Usage: samtools collate [options...] 
[]\n\n" "Options:\n" - " -O output to samtools_stdout\n" - " -o output file name (use prefix if not set)\n" - " -u uncompressed BAM output\n" - " -f fast (only primary alignments)\n" - " -r working reads stored (with -f) [%d]\n" // reads_store - " -l INT compression level [%d]\n" // DEF_CLEVEL - " -n INT number of temporary files [%d]\n" // n_files + " -O Output to samtools_stdout\n" + " -o Output file name (use prefix if not set)\n" + " -u Uncompressed BAM output\n" + " -f Fast (only primary alignments)\n" + " -r Working reads stored (with -f) [%d]\n" // reads_store + " -l INT Compression level [%d]\n" // DEF_CLEVEL + " -n INT Number of temporary files [%d]\n" // n_files + " -T PREFIX\n" + " Write tempory files to PREFIX.nnnn.bam\n" " --no-PG do not add a PG line\n", reads_store, DEF_CLEVEL, n_files); @@ -548,9 +550,21 @@ static int usage(FILE *fp, int n_files, int reads_store) { return 1; } -char * generate_prefix() { +char *generate_prefix(const char *out_fn) { char *prefix; unsigned int pid = getpid(); + + if (out_fn && !(*out_fn == '-' && out_fn[1] == '\0')) { + // ... 
+ size_t plen = strlen(out_fn) + 50; + if (!(prefix = malloc(plen))) { + perror("collate"); + return NULL; + } + snprintf(prefix, plen, "%s.collate%x", out_fn, pid); + return prefix; + } + #ifdef _WIN32 # define PREFIX_LEN (MAX_PATH + 16) DWORD ret; @@ -569,13 +583,18 @@ char * generate_prefix() { snprintf(prefix + ret, PREFIX_LEN - ret, "\\%x", pid); return prefix; #else -# define PREFIX_LEN 64 - prefix = malloc(PREFIX_LEN); + char *tmp_env = getenv("TMPDIR"); + if (!tmp_env) + tmp_env = "/tmp"; + + size_t prefix_len = strlen(tmp_env)+20; + prefix = malloc(prefix_len); if (!prefix) { perror("collate"); return NULL; } - snprintf(prefix, PREFIX_LEN, "/tmp/collate%x", pid); + snprintf(prefix, prefix_len, "%s/collate%x", tmp_env, pid); + return prefix; #endif } @@ -592,7 +611,7 @@ int main_bamshuf(int argc, char *argv[]) { NULL, 0, NULL, 0 } }; - while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:", lopts, NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:T:", lopts, NULL)) >= 0) { switch (c) { case 'n': n_files = atoi(optarg); break; case 'l': clevel = atoi(optarg); break; @@ -601,6 +620,7 @@ int main_bamshuf(int argc, char *argv[]) case 'o': output_file = optarg; break; case 'f': fast_coll = 1; break; case 'r': reads_store = atoi(optarg); break; + case 'T': prefix = optarg; break; case 1: no_pg = 1; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ @@ -609,6 +629,12 @@ int main_bamshuf(int argc, char *argv[]) } if (is_un) clevel = 0; if (argc >= optind + 2) prefix = argv[optind+1]; + if (argc == optind) { + if (argc > 1 || !isatty(STDIN_FILENO)) + fprintf(samtools_stderr, "collate: no input filename specified.\n"); + return usage(argc > 1 || !isatty(STDIN_FILENO) ? 
samtools_stderr : samtools_stdout, + n_files, reads_store); + } if (!(prefix || is_stdout || output_file)) return usage(samtools_stderr, n_files, reads_store); if (is_stdout && output_file) { @@ -616,7 +642,7 @@ int main_bamshuf(int argc, char *argv[]) return usage(samtools_stderr, n_files, reads_store); } if (!prefix) { - prefix = generate_prefix(); + prefix = generate_prefix(output_file); pre_mem = 1; } diff --git a/samtools/bamtk.c b/samtools/bamtk.c index e690c1d..e05ea18 100644 --- a/samtools/bamtk.c +++ b/samtools/bamtk.c @@ -1,6 +1,6 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2022 Genome Research Ltd. + Copyright (C) 2008-2023 Genome Research Ltd. Author: Heng Li @@ -72,6 +72,8 @@ int main_import(int argc, char *argv[]); int main_samples(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); int main_reference(int argc, char *argv[]); +int main_reset(int argc, char *argv[]); +int main_cram_size(int argc, char *argv[]); const char *samtools_version() { @@ -101,7 +103,7 @@ const char *samtools_feature_string(void) { static void long_version(void) { printf("samtools %s\n" "Using htslib %s\n" - "Copyright (C) 2022 Genome Research Ltd.\n", + "Copyright (C) 2023 Genome Research Ltd.\n", samtools_version(), hts_version()); printf("\nSamtools compilation details:\n"); @@ -181,6 +183,7 @@ static void usage(FILE *fp) " fasta converts a BAM to a FASTA\n" " import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n" " reference Generates a reference from aligned data\n" +" reset Reverts aligner changes in reads\n" "\n" " -- Statistics\n" " bedcov read depth per BED region\n" @@ -188,6 +191,7 @@ static void usage(FILE *fp) " depth compute the depth\n" " flagstat simple stats\n" " idxstats BAM index stats\n" +" cram-size list CRAM Content-ID and Data-Series sizes\n" " phase phase heterozygotes\n" " stats generate stats (former bamcheck)\n" " ampliconstats generate amplicon specific stats\n" @@ -287,12 +291,14 @@ int main(int argc, 
char *argv[]) else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1); + else if (strcmp(argv[1], "cram-size") == 0) ret = main_cram_size(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) long_version(); else if (strcmp(argv[1], "--version-only") == 0) { printf("%s+htslib-%s\n", samtools_version(), hts_version()); } + else if (strcmp(argv[1], "reset") == 0) ret = main_reset(argc-1, argv+1); else { fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/samtools/bamtk.c.pysam.c b/samtools/bamtk.c.pysam.c index b798658..d95ec05 100644 --- a/samtools/bamtk.c.pysam.c +++ b/samtools/bamtk.c.pysam.c @@ -2,7 +2,7 @@ /* bamtk.c -- main samtools command front-end. - Copyright (C) 2008-2022 Genome Research Ltd. + Copyright (C) 2008-2023 Genome Research Ltd. 
Author: Heng Li @@ -75,6 +75,8 @@ int main_import(int argc, char *argv[]); int main_samples(int argc, char *argv[]); int main_consensus(int argc, char *argv[]); int main_reference(int argc, char *argv[]); +int main_reset(int argc, char *argv[]); +int main_cram_size(int argc, char *argv[]); const char *samtools_version() { @@ -104,7 +106,7 @@ const char *samtools_feature_string(void) { static void long_version(void) { fprintf(samtools_stdout, "samtools %s\n" "Using htslib %s\n" - "Copyright (C) 2022 Genome Research Ltd.\n", + "Copyright (C) 2023 Genome Research Ltd.\n", samtools_version(), hts_version()); fprintf(samtools_stdout, "\nSamtools compilation details:\n"); @@ -184,6 +186,7 @@ static void usage(FILE *fp) " fasta converts a BAM to a FASTA\n" " import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n" " reference Generates a reference from aligned data\n" +" reset Reverts aligner changes in reads\n" "\n" " -- Statistics\n" " bedcov read depth per BED region\n" @@ -191,6 +194,7 @@ static void usage(FILE *fp) " depth compute the depth\n" " flagstat simple stats\n" " idxstats BAM index stats\n" +" cram-size list CRAM Content-ID and Data-Series sizes\n" " phase phase heterozygotes\n" " stats generate stats (former bamcheck)\n" " ampliconstats generate amplicon specific stats\n" @@ -290,12 +294,14 @@ int samtools_main(int argc, char *argv[]) else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1); else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1); else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1); + else if (strcmp(argv[1], "cram-size") == 0) ret = main_cram_size(argc-1, argv+1); else if (strcmp(argv[1], "version") == 0 || \ strcmp(argv[1], "--version") == 0) long_version(); else if (strcmp(argv[1], "--version-only") == 0) { fprintf(samtools_stdout, "%s+htslib-%s\n", samtools_version(), hts_version()); } + else if (strcmp(argv[1], "reset") == 0) ret = main_reset(argc-1, argv+1); else 
{ fprintf(samtools_stderr, "[main] unrecognized command '%s'\n", argv[1]); return 1; diff --git a/samtools/consensus_pileup.c b/samtools/consensus_pileup.c index 935cbdc..b48aac2 100644 --- a/samtools/consensus_pileup.c +++ b/samtools/consensus_pileup.c @@ -186,7 +186,7 @@ static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { p->base4 = 16; p->padding = 1; if (p->seq_offset < b->core.l_qseq) - p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]); else p->qual = 0; } else { @@ -196,9 +196,9 @@ static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { p->base = '*'; p->base4 = 16; if (p->seq_offset+1 < b->core.l_qseq) - p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]); else - p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset]); break; case BAM_CPAD: @@ -206,9 +206,9 @@ static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { p->base = '*'; p->base4 = 16; if (p->seq_offset+1 < b->core.l_qseq) - p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]); else - p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset]); break; case BAM_CREF_SKIP: @@ -312,14 +312,18 @@ int pileup_loop(samFile *fp, samFile *fp, sam_hdr_t *h, pileup_t *p), - int (*seq_add)(void *client_data, - samFile *fp, - sam_hdr_t *h, - pileup_t *p, - int depth, - hts_pos_t pos, - int nth, - int is_insert), + int (*seq_column)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert), + void (*seq_free)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p), void *client_data) { int ret = -1; pileup_t *phead = NULL, *p, *pfree = NULL, *last, *next, *ptail = NULL; @@ -429,13 +433,13 @@ int pileup_loop(samFile *fp, 
ptail = phead; /* Call our function on phead linked list */ - v = seq_add(client_data, fp, h, phead, depth, + v = seq_column(client_data, fp, h, phead, depth, #ifdef START_WITH_DEL - col-1, + col-1, #else - col, + col, #endif - nth, is_insert); + nth, is_insert); /* Remove dead seqs */ for (p = eof_head ; p; p = p->eofn) { @@ -446,6 +450,9 @@ int pileup_loop(samFile *fp, p->next = pfree; pfree = p; + + if (seq_free) + seq_free(client_data, fp, h, p); } if (v == 1) @@ -587,6 +594,8 @@ int pileup_loop(samFile *fp, /* Tidy up */ for (p = pfree; p; p = next) { next = p->next; + if (seq_free) + seq_free(client_data, fp, h, p); free(p->b.data); free(p); } diff --git a/samtools/consensus_pileup.c.pysam.c b/samtools/consensus_pileup.c.pysam.c index dde0ad0..99fb957 100644 --- a/samtools/consensus_pileup.c.pysam.c +++ b/samtools/consensus_pileup.c.pysam.c @@ -188,7 +188,7 @@ static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { p->base4 = 16; p->padding = 1; if (p->seq_offset < b->core.l_qseq) - p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]); else p->qual = 0; } else { @@ -198,9 +198,9 @@ static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { p->base = '*'; p->base4 = 16; if (p->seq_offset+1 < b->core.l_qseq) - p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]); else - p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset]); break; case BAM_CPAD: @@ -208,9 +208,9 @@ static int get_next_base(pileup_t *p, hts_pos_t pos, int nth, int *is_insert) { p->base = '*'; p->base4 = 16; if (p->seq_offset+1 < b->core.l_qseq) - p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]); else - p->qual = (p->qual + p->b_qual[p->seq_offset])/2; + p->qual = MIN(p->qual, p->b_qual[p->seq_offset]); break; case BAM_CREF_SKIP: @@ -314,14 +314,18 
@@ int pileup_loop(samFile *fp, samFile *fp, sam_hdr_t *h, pileup_t *p), - int (*seq_add)(void *client_data, - samFile *fp, - sam_hdr_t *h, - pileup_t *p, - int depth, - hts_pos_t pos, - int nth, - int is_insert), + int (*seq_column)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert), + void (*seq_free)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p), void *client_data) { int ret = -1; pileup_t *phead = NULL, *p, *pfree = NULL, *last, *next, *ptail = NULL; @@ -431,13 +435,13 @@ int pileup_loop(samFile *fp, ptail = phead; /* Call our function on phead linked list */ - v = seq_add(client_data, fp, h, phead, depth, + v = seq_column(client_data, fp, h, phead, depth, #ifdef START_WITH_DEL - col-1, + col-1, #else - col, + col, #endif - nth, is_insert); + nth, is_insert); /* Remove dead seqs */ for (p = eof_head ; p; p = p->eofn) { @@ -448,6 +452,9 @@ int pileup_loop(samFile *fp, p->next = pfree; pfree = p; + + if (seq_free) + seq_free(client_data, fp, h, p); } if (v == 1) @@ -589,6 +596,8 @@ int pileup_loop(samFile *fp, /* Tidy up */ for (p = pfree; p; p = next) { next = p->next; + if (seq_free) + seq_free(client_data, fp, h, p); free(p->b.data); free(p); } diff --git a/samtools/consensus_pileup.h b/samtools/consensus_pileup.h index 6eafdbb..7aacfaa 100644 --- a/samtools/consensus_pileup.h +++ b/samtools/consensus_pileup.h @@ -58,6 +58,25 @@ typedef struct pileup { bam1_t b; // Bam entry associated with struct } pileup_t; +/* + * The pileup loop executes and calls callbacks to perform the work. + * + * seq_fetch returns the next sequence. Return 0 from this indicates no + * more data. + * + * seq_init is called, if non-NULL, when a sequence is added to the pileup, + * seq_free likewise, if non-NULL, is called when a sequence is removed + * from the pileup. + * These two functions are akin to the constructor and destructors added + * to mpileup. 
+ * + * seq_column is the primary work horse which is executed for each + * reference position, and for each inserted base per ref pos. + * + * If we were to invert this from a loop generating callbacks to a polled + * style interface like mpileup, then the seq_column bit would be dropped + * and replaced by the returned pileup and associated parameters. + */ int pileup_loop(samFile *fp, sam_hdr_t *h, int (*seq_fetch)(void *client_data, @@ -68,12 +87,16 @@ int pileup_loop(samFile *fp, samFile *fp, sam_hdr_t *h, pileup_t *p), - int (*seq_add)(void *client_data, - samFile *fp, - sam_hdr_t *h, - pileup_t *p, - int depth, - hts_pos_t pos, - int nth, - int is_insert), + int (*seq_column)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p, + int depth, + hts_pos_t pos, + int nth, + int is_insert), + void (*seq_free)(void *client_data, + samFile *fp, + sam_hdr_t *h, + pileup_t *p), void *client_data); diff --git a/samtools/cram_size.c b/samtools/cram_size.c new file mode 100644 index 0000000..6c397bc --- /dev/null +++ b/samtools/cram_size.c @@ -0,0 +1,665 @@ +/* cram_size.c -- produces summary of the size of each cram data-series + + Copyright (C) 2023 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +// TODO: add range query. Eg the ability to look at size for "*" only +// (unmapped), or in a specific region such as a centromere. + +#include + +#include +#include +#include +#include +#include + +#include "htslib/bgzf.h" +#include "htslib/sam.h" +#include "htslib/cram.h" +#include "htslib/kstring.h" +#include "htslib/khash.h" +#include "samtools.h" +#include "sam_opts.h" +#include "htslib/hfile.h" + +/*---------------------------------------------------------------------- + * Compression method handling + */ + +// A numeric version of the cram_method_details struct. +// We expand the myriad of struct field combinations into a single +// enumerated type so we can index and accumulate statistics for +// purposes of reporting. +// +// These expanded numeric values have no definition within CRAM itself +// and never occur within the file format. +enum comp_expanded { + //---- + // Copies from htslib cram_block_method enum + COMP_RAW = CRAM_COMP_RAW, + COMP_GZIP = CRAM_COMP_GZIP, + COMP_BZIP2 = CRAM_COMP_BZIP2, + COMP_LZMA = CRAM_COMP_LZMA, + COMP_RANS8 = CRAM_COMP_RANS4x8, + COMP_RANS16 = CRAM_COMP_RANSNx16, + COMP_ARITH = CRAM_COMP_ARITH, + COMP_FQZ = CRAM_COMP_FQZ, + COMP_TOK3 = CRAM_COMP_TOK3, + + //---- + // Localised variants. + + // Gzip + COMP_GZIP_1, + COMP_GZIP_9, + + // Bzip2 + COMP_BZIP2_1, + COMP_BZIP2_2, + COMP_BZIP2_3, + COMP_BZIP2_4, + COMP_BZIP2_5, + COMP_BZIP2_6, + COMP_BZIP2_7, + COMP_BZIP2_8, + COMP_BZIP2_9, + + // rans 4x8 + COMP_RANS4x8_O0, + COMP_RANS4x8_O1, + + // rans Nx16. 
Note order here is to enable selection via bit-fields + // bit 0: O0/O1 + // bit 1: RLE + // bit 2: PACK + // bit 3: 32x16 + COMP_RANS4x16_O0, + COMP_RANS4x16_O1, + COMP_RANS4x16_O0R, // +RLE + COMP_RANS4x16_O1R, + COMP_RANS4x16_O0P, // +PACK + COMP_RANS4x16_O1P, + COMP_RANS4x16_O0PR, // +PACK+RLE + COMP_RANS4x16_O1PR, + COMP_RANS32x16_O0, // SIMD variants + COMP_RANS32x16_O1, + COMP_RANS32x16_O0R, // +RLE + COMP_RANS32x16_O1R, + COMP_RANS32x16_O0P, // +PACK + COMP_RANS32x16_O1P, + COMP_RANS32x16_O0PR, // +PACK+RLE + COMP_RANS32x16_O1PR, + COMP_RANSNx16_STRIPE, + COMP_RANSNx16_CAT, + + // Arith + COMP_ARITH_O0, + COMP_ARITH_O1, + COMP_ARITH_O0R, // +RLE + COMP_ARITH_O1R, + COMP_ARITH_O0P, // +PACK + COMP_ARITH_O1P, + COMP_ARITH_O0PR, // +PACK+RLE + COMP_ARITH_O1PR, + COMP_ARITH_STRIPE, + COMP_ARITH_CAT, // no entropy encoder + COMP_ARITH_EXT, // external entropy encode + + // Nake tokeniser + COMP_TOK3_RANS, + COMP_TOK3_ARITH, + + // To mark maximum size + COMP_MAX, +}; + +static enum comp_expanded comp_method2expanded(cram_method_details *cm) { + switch (cm->method) { + case CRAM_COMP_GZIP: + switch (cm->level) { + case 1: return COMP_GZIP_1; + case 9: return COMP_GZIP_9; + default: return COMP_GZIP; + } + break; + + case CRAM_COMP_BZIP2: + if (cm->level >= 1 && cm->level <= 9) + return COMP_BZIP2_1 + cm->level-1; + else + return COMP_BZIP2; + break; + + case CRAM_COMP_RANS4x8: + return cm->order ? 
COMP_RANS4x8_O1 : COMP_RANS4x8_O0; + + case CRAM_COMP_RANSNx16: { + // 8 4x16, 8 32x16 and 2 stripe/cat + if (cm->stripe) return COMP_RANSNx16_STRIPE; + if (cm->cat) return COMP_RANSNx16_CAT; + int c = COMP_RANS4x16_O0; + c += 1*cm->order; + c += 2*cm->rle; + c += 4*cm->pack; + c += 8*(cm->Nway==32); + return c; + } + + case CRAM_COMP_ARITH: { + // 8 4x16, 8 32x16 and 2 stripe/cat + if (cm->stripe) return COMP_ARITH_STRIPE; + if (cm->cat) return COMP_ARITH_CAT; + if (cm->ext) return COMP_ARITH_EXT; + int c = COMP_ARITH_O0; + c += 1*cm->order; + c += 2*cm->rle; + c += 4*cm->pack; + return c; + } + + case CRAM_COMP_TOK3: + return cm->level < 10 + ? COMP_TOK3_RANS + : COMP_TOK3_ARITH; + + default: + // Any unspecialised method + return (enum comp_expanded)cm->method; + } +} + +// Short form of cram_block_method_int type +static char comp_method2char[COMP_MAX] = + ".gblr0afn" // standard CRAM methods + "_G" // gzip + "bbbbbbbbB" // bzip2 + "rR" // rans4x8 + "010101014545454582" // ransNx16 + "aAaAaAaAaaa" // arith + "nN"; // tok3 + +// Long form of cram_block_method_int type +static char *comp_method2str[COMP_MAX] = { + // Standard CRAM methods + "raw", "gzip", "bzip2", "lzma", "r4x8", "rNx16", + "arith", "fqzcomp", "tok3", + + // custom gzip + "gzip-min", "gzip-max", + + // custom bzip2 + "bzip2-1", "bzip2-2", "bzip2-3", "bzip2-4", "bzip2-5", + "bzip2-6", "bzip2-7", "bzip2-8", "bzip2-9", + + // rANS 4x8 + "r4x8-o0", "r4x8-o1", + + // rANS 4x16 + "r4x16-o0", "r4x16-o1", + + "r4x16-o0R", "r4x16-o1R", + "r4x16-o0P", "r4x16-o1P", + "r4x16-o0PR", "r4x16-o1PR", + "r32x16-o0", "r32x16-o1", + "r32x16-o0R", "r32x16-o1R", + "r32x16-o0P", "r32x16-o1P", + "r32x16-o0PR","r32x16-o1PR", + "rNx16-xo0", "rNx16-cat", + + // Arith + "arith-o0", "arith-o1", + "arith-o0R", "arith-o1R", + "arith-o0P", "arith-o1P", + "arith-o0PR", "arith-o1PR", + "arith-stripe", "arith-cat", "arith-ext", + + // Name tokeniser + "tok3-rans", "tok3-arith", +}; + 
+/*---------------------------------------------------------------------- + * Manipulation and sorting of Block Content-ID arrays and hashes + */ + +typedef struct { + int64_t csize[COMP_MAX]; + int64_t usize[COMP_MAX]; +} cusize_t; + +static int64_t total_csize(cusize_t *cu) { + int i; + int64_t tot = 0; + for (i = 0; i < COMP_MAX; i++) + tot += cu->csize[i]; + return tot; +} + +static int64_t total_usize(cusize_t *cu) { + int i; + int64_t tot = 0; + for (i = 0; i < COMP_MAX; i++) + tot += cu->usize[i]; + return tot; +} + +// cusize_t array and sorting by compressed size +static cusize_t *sort_cusize_global; // avoids a messy extra data type +static int sort_cusize_compar(const void *i1, const void *i2) { + int64_t n = sort_cusize_global->csize[*(const int *)i2] - + sort_cusize_global->csize[*(const int *)i1]; + return n > 0 ? 1 : (n < 0 ? -1 : *(const int *)i1 - *(const int *)i2); +} + +// Sort a cusize array by size of used method. +// Returns cu->csize[comp] indices in descending size, as static mem +static int *sort_cusize(cusize_t *cu) { + static int idx[COMP_MAX]; + int i; + for (i = 0; i < COMP_MAX; i++) + idx[i] = i; + sort_cusize_global = cu; + qsort(idx, COMP_MAX, sizeof(*idx), sort_cusize_compar); + + return idx; +} + +// Hash table of cusize_t and sorting by key (content-id) +KHASH_MAP_INIT_INT(cu, cusize_t) + +/* Sort by hash key. Global due to rubbish qsort API, but it's simple. 
*/ +static khash_t(cu) *global_cu_hash = NULL; +static int cu_compar(const void *i1, const void *i2) { + return kh_key(global_cu_hash, *(const int *)i1) - + kh_key(global_cu_hash, *(const int *)i2); +} + +/*---------------------------------------------------------------------- + * Main cram_size reporting and aggregation + */ +static off_t report_size(FILE *outfp, int verbose, int ref_seq_blk, + khash_t(cu) *cu_size, cram_cid2ds_t *cid2ds) { + if (!cu_size || !cid2ds) + return -1; + + khiter_t k; + off_t tot_size = 0; + + fprintf(outfp, "# Content_ID Uncomp.size Comp.size Ratio Method%.*s Data_series\n", verbose ? 4 : 0, " "); + int *sorted_blocks = malloc(kh_end(cu_size)*sizeof(int)); + if (!sorted_blocks) + return -1; + int nblocks = 0; + for (k = kh_begin(cu_size); k != kh_end(cu_size); k++) { + if (!kh_exist(cu_size, k)) + continue; + sorted_blocks[nblocks++] = k; + } + global_cu_hash = cu_size; + qsort(sorted_blocks, nblocks, sizeof(int), cu_compar); + + int i; + for (i = 0; i < nblocks; i++) { + k = sorted_blocks[i]; + + if (verbose) { + // FULL output + int *comp_idx = sort_cusize(&kh_value(cu_size, k)); + int first_line = 1, c, j; + for (c = 0; c < COMP_MAX; c++) { + int comp = comp_idx[c]; + if (!kh_value(cu_size, k).csize[comp] && c) + break; + + if (!first_line) + fprintf(outfp, "\n"); + first_line = 0; + + if ((int)kh_key(cu_size, k) < 0) + fprintf(outfp, "BLOCK %8s", "CORE"); + else + fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k)); + + fprintf(outfp, " %12"PRId64" %12"PRId64, + kh_value(cu_size, k).usize[comp], + kh_value(cu_size, k).csize[comp]); + double f = (100.0*(kh_value(cu_size, k).csize[comp]+.0001)) / + (kh_value(cu_size, k).usize[comp]+.0001); + if (f > 999) + fprintf(outfp, " >999%% %-11s", comp_method2str[comp]); + else + fprintf(outfp, " %6.2f%% %-11s",f, comp_method2str[comp]); + + int n, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n); + for (j = 0; j < n; j++) { + int d = dsa[j]; + if (d > 65535) + fprintf(outfp, " %c%c%c", 
d>>16, (d>>8)&0xff, d&0xff); + else + fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff); + } + } + } else { + // aggregate by compression type. + int64_t csize = total_csize(&kh_value(cu_size, k)); + int64_t usize = total_usize(&kh_value(cu_size, k)); + int *comp_idx = sort_cusize(&kh_value(cu_size, k)); + + char cstr[COMP_MAX+1] = {0}; + int cidx = 0, c; + for (c = 0; c < COMP_MAX; c++) { + if (!kh_value(cu_size, k).csize[comp_idx[c]]) + break; + cstr[cidx++] = comp_method2char[comp_idx[c]]; + } + if (!*cstr) *cstr = '.'; + + if ((int)kh_key(cu_size, k) < 0) + fprintf(outfp, "BLOCK %8s", "CORE"); + else + fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k)); + fprintf(outfp, " %12"PRId64" %12"PRId64, usize, csize); + double f = 100*(csize+.0001)/(usize+.0001); + if (f > 999) + fprintf(outfp, " >999%% %-7s", cstr); + else + fprintf(outfp, " %6.2f%% %-7s", f, cstr); + + int n, j, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n); + for (j = 0; j < n; j++) { + int d = dsa[j]; + if (d > 65535) + fprintf(outfp, " %c%c%c", d>>16, (d>>8)&0xff, d&0xff); + else + fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff); + } + } + + if ((int)kh_key(cu_size, k) >= 0 && + (int)kh_key(cu_size, k) == ref_seq_blk) { + fprintf(outfp, " embedded_ref"); + } + fprintf(outfp, "\n"); + + tot_size += total_csize(&kh_value(cu_size, k)); + } + + free(sorted_blocks); + + return tot_size; +} + +/* Main processing loop */ +static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp, + int verbose, int encodings) { + cram_fd *in_c; + cram_container *c = NULL; + cram_block *blk = NULL; + cram_block_slice_hdr *shdr = NULL; + khiter_t k; + int ret; + cram_cid2ds_t *cid2ds = NULL; + khash_t(cu) *cu_size = kh_init(cu); + int ref_seq_blk_used = -1; + int64_t nseqs = 0, nbases = 0, ncont = 0, nslice = 0; + + if (!in->is_cram) { + print_error("cram_size", "Input is not a CRAM file"); + goto err; + } + in_c = in->fp.cram; // low level htslib abuse? 
+ while ((c = cram_read_container(in_c))) { + if (cram_container_is_empty(in_c)) { + cram_block *blk; + // Container compression header + if (!(blk = cram_read_block(in_c))) + goto err; + cram_free_block(blk); + cram_free_container(c); + c = NULL; blk = NULL; + continue; + } + + nseqs += cram_container_get_num_records(c); + nbases += cram_container_get_num_bases(c); + + // Container compression header + int32_t num_slices; + if (!(blk = cram_read_block(in_c))) + goto err; + + // Decode compression header... + cram_block_compression_hdr *chdr; + chdr = cram_decode_compression_header(in_c, blk); + + if (encodings) { + kstring_t ks = KS_INITIALIZE; + if (cram_describe_encodings(chdr, &ks) < 0) + goto err; + + fprintf(outfp, "Container encodings\n%s\n", ks_str(&ks)); + + ks_free(&ks); + } + + cid2ds = cram_update_cid2ds_map(chdr, cid2ds); + + cram_free_block(blk); + blk = NULL; + + cram_free_compression_header(chdr); + + // Container num_blocks can be invalid, due to a bug. + // Instead we iterate in slice context instead. + (void)cram_container_get_landmarks(c, &num_slices); + ncont++; + nslice += num_slices; + + int i, j; + for (i = 0; i < num_slices; i++) { + // Slice header + if (!(blk = cram_read_block(in_c))) + goto err; + if (!(shdr = cram_decode_slice_header(in_c, blk))) + goto err; + cram_free_block(blk); + blk = NULL; + + int ref_seq_blk = cram_slice_hdr_get_embed_ref_id(shdr); + int num_blocks = cram_slice_hdr_get_num_blocks(shdr); + + // Embedded reference. Check it's consistent (if used this is + // an almost guaranteed certainty, so we take the easy route). 
+ if (ref_seq_blk >= 0) { + if (ref_seq_blk_used == -1) + ref_seq_blk_used = ref_seq_blk; + else if (ref_seq_blk_used != ref_seq_blk) + fprintf(stderr, "Embedded reference is not consistently using the same Content-Id.\n" + "Reported figures for reference will be invalid.\n"); + } + + // Slice data blocks + for (j = 0; j < num_blocks; j++) { + // read and discard, unless it's the ref-ID block + if (!(blk = cram_read_block(in_c))) + goto err; + + int32_t csize = cram_block_get_comp_size(blk); + int32_t usize = cram_block_get_uncomp_size(blk); + int cid = cram_block_get_content_id(blk); + enum cram_block_method method = cram_block_get_method(blk); + + // Expand comp to the internal sub-formats, eg + // rANS order-0/1, PACK+RLE, etc. + cram_method_details *cm; + cm = cram_expand_method(cram_block_get_data(blk), + cram_block_get_comp_size(blk), + method); + if (!cm) + goto err; + enum comp_expanded comp + = comp_method2expanded(cm); + free(cm); + + k = kh_put(cu, cu_size, cid, &ret); + if (ret < 0) + goto err; + if (ret == 0) { + kh_value(cu_size, k).csize[comp] += csize; + kh_value(cu_size, k).usize[comp] += usize; + } else { + memset(&kh_value(cu_size, k), 0, sizeof(cusize_t)); + kh_value(cu_size, k).csize[comp] = csize; + kh_value(cu_size, k).usize[comp] = usize; + } + + cram_free_block(blk); + blk = NULL; + } + cram_free_slice_header(shdr); + shdr = NULL; + } + + cram_free_container(c); + c = NULL; + } + + off_t tot_size = report_size(outfp, verbose, ref_seq_blk_used, + cu_size, cid2ds); + if (tot_size < 0) + goto err; + + kh_destroy(cu, cu_size); + cram_cid2ds_free(cid2ds); + + off_t end = htell(hf_in); + + fprintf(outfp, "\n"); + fprintf(outfp, "Number of containers %18"PRId64"\n", ncont); + fprintf(outfp, "Number of slices %18"PRId64"\n", nslice); + fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs); + fprintf(outfp, "Number of bases %18"PRId64"\n", nbases); + fprintf(outfp, "Total file size %18"PRId64"\n", end); + fprintf(outfp, "Format overhead size 
%18"PRId64"\n", end - tot_size); + + return 0; + + err: + // Report anyway so we can get stats on partial files, but be + // sure to error too. + report_size(outfp, verbose, ref_seq_blk_used, cu_size, cid2ds); + + print_error("cram_size", "Failed in decoding CRAM file"); + if (blk) + cram_free_block(blk); + if (shdr) + cram_free_slice_header(shdr); + if (c) + cram_free_container(c); + if (cid2ds) + cram_cid2ds_free(cid2ds); + + return -1; +} + +/* main() for cram_size */ +int main_cram_size(int argc, char *argv[]) { + int c, usage = 0, verbose = 0, encodings = 0; + sam_hdr_t *h = 0; + hFILE *hf_in = NULL; + samFile *in = NULL; + sam_global_args ga; + FILE *outfp = stdout; + + static const struct option lopts[] = { + {"output", required_argument, NULL, 'o'}, + {"verbose", no_argument, NULL, 'v'}, + {"encodings", no_argument, NULL, 'e'}, + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '-'), + { NULL, 0, NULL, 0 } + }; + + sam_global_args_init(&ga); + + while ((c = getopt_long(argc, argv, "vo:e", lopts, NULL)) >= 0) { + switch (c) { + case 'o': + if (!(outfp = fopen(optarg, "w"))) { + perror(optarg); + goto err; + } + break; + + case 'v': + verbose++; + break; + + case 'e': + encodings++; + break; + + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage=1; break; + } + } + + if ((optind == argc && isatty(0)) || usage) { + printf("Usage: samtools cram_size [-ve] [-o out.size] [in.cram]\n"); + return 0; + } + + char *fn = optind < argc ? argv[optind] : "-"; + + // We want access to in->fp.cram->fp, but this is an opaque struct so we + // can't get that. However we opened with hopen and then reopen as + // CRAM with hts_hopen, which will swallow the initial hFILE and take + // owenership of it. Hence we now know in->fp.cram->fp. 
+ if (!(hf_in = hopen(fn, "r"))) { + print_error_errno("cram_size", "failed to open file '%s'", fn); + return 1; + } + if (!(in = hts_hopen(hf_in, fn, "r"))) { + print_error_errno("cram_size", "failed to open file '%s'", fn); + goto err; + } + + if (!(h = sam_hdr_read(in))) + goto err; + + int ret = cram_size(hf_in, in, h, outfp, verbose, encodings); + sam_hdr_destroy(h); + sam_close(in); + if (outfp != stdout) + fclose(outfp); + + return ret ? 1 : 0; + + err: + if (in) + sam_close(in); + if (h) + sam_hdr_destroy(h); + + return 1; +} diff --git a/samtools/cram_size.c.pysam.c b/samtools/cram_size.c.pysam.c new file mode 100644 index 0000000..f260419 --- /dev/null +++ b/samtools/cram_size.c.pysam.c @@ -0,0 +1,667 @@ +#include "samtools.pysam.h" + +/* cram_size.c -- produces summary of the size of each cram data-series + + Copyright (C) 2023 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +// TODO: add range query. 
Eg the ability to look at size for "*" only +// (unmapped), or in a specific region such as a centromere. + +#include + +#include +#include +#include +#include +#include + +#include "htslib/bgzf.h" +#include "htslib/sam.h" +#include "htslib/cram.h" +#include "htslib/kstring.h" +#include "htslib/khash.h" +#include "samtools.h" +#include "sam_opts.h" +#include "htslib/hfile.h" + +/*---------------------------------------------------------------------- + * Compression method handling + */ + +// A numeric version of the cram_method_details struct. +// We expand the myriad of struct field combinations into a single +// enumerated type so we can index and accumulate statistics for +// purposes of reporting. +// +// These expanded numeric values have no definition within CRAM itself +// and never occur within the file format. +enum comp_expanded { + //---- + // Copies from htslib cram_block_method enum + COMP_RAW = CRAM_COMP_RAW, + COMP_GZIP = CRAM_COMP_GZIP, + COMP_BZIP2 = CRAM_COMP_BZIP2, + COMP_LZMA = CRAM_COMP_LZMA, + COMP_RANS8 = CRAM_COMP_RANS4x8, + COMP_RANS16 = CRAM_COMP_RANSNx16, + COMP_ARITH = CRAM_COMP_ARITH, + COMP_FQZ = CRAM_COMP_FQZ, + COMP_TOK3 = CRAM_COMP_TOK3, + + //---- + // Localised variants. + + // Gzip + COMP_GZIP_1, + COMP_GZIP_9, + + // Bzip2 + COMP_BZIP2_1, + COMP_BZIP2_2, + COMP_BZIP2_3, + COMP_BZIP2_4, + COMP_BZIP2_5, + COMP_BZIP2_6, + COMP_BZIP2_7, + COMP_BZIP2_8, + COMP_BZIP2_9, + + // rans 4x8 + COMP_RANS4x8_O0, + COMP_RANS4x8_O1, + + // rans Nx16. 
Note order here is to enable selection via bit-fields + // bit 0: O0/O1 + // bit 1: RLE + // bit 2: PACK + // bit 3: 32x16 + COMP_RANS4x16_O0, + COMP_RANS4x16_O1, + COMP_RANS4x16_O0R, // +RLE + COMP_RANS4x16_O1R, + COMP_RANS4x16_O0P, // +PACK + COMP_RANS4x16_O1P, + COMP_RANS4x16_O0PR, // +PACK+RLE + COMP_RANS4x16_O1PR, + COMP_RANS32x16_O0, // SIMD variants + COMP_RANS32x16_O1, + COMP_RANS32x16_O0R, // +RLE + COMP_RANS32x16_O1R, + COMP_RANS32x16_O0P, // +PACK + COMP_RANS32x16_O1P, + COMP_RANS32x16_O0PR, // +PACK+RLE + COMP_RANS32x16_O1PR, + COMP_RANSNx16_STRIPE, + COMP_RANSNx16_CAT, + + // Arith + COMP_ARITH_O0, + COMP_ARITH_O1, + COMP_ARITH_O0R, // +RLE + COMP_ARITH_O1R, + COMP_ARITH_O0P, // +PACK + COMP_ARITH_O1P, + COMP_ARITH_O0PR, // +PACK+RLE + COMP_ARITH_O1PR, + COMP_ARITH_STRIPE, + COMP_ARITH_CAT, // no entropy encoder + COMP_ARITH_EXT, // external entropy encode + + // Nake tokeniser + COMP_TOK3_RANS, + COMP_TOK3_ARITH, + + // To mark maximum size + COMP_MAX, +}; + +static enum comp_expanded comp_method2expanded(cram_method_details *cm) { + switch (cm->method) { + case CRAM_COMP_GZIP: + switch (cm->level) { + case 1: return COMP_GZIP_1; + case 9: return COMP_GZIP_9; + default: return COMP_GZIP; + } + break; + + case CRAM_COMP_BZIP2: + if (cm->level >= 1 && cm->level <= 9) + return COMP_BZIP2_1 + cm->level-1; + else + return COMP_BZIP2; + break; + + case CRAM_COMP_RANS4x8: + return cm->order ? 
COMP_RANS4x8_O1 : COMP_RANS4x8_O0; + + case CRAM_COMP_RANSNx16: { + // 8 4x16, 8 32x16 and 2 stripe/cat + if (cm->stripe) return COMP_RANSNx16_STRIPE; + if (cm->cat) return COMP_RANSNx16_CAT; + int c = COMP_RANS4x16_O0; + c += 1*cm->order; + c += 2*cm->rle; + c += 4*cm->pack; + c += 8*(cm->Nway==32); + return c; + } + + case CRAM_COMP_ARITH: { + // 8 4x16, 8 32x16 and 2 stripe/cat + if (cm->stripe) return COMP_ARITH_STRIPE; + if (cm->cat) return COMP_ARITH_CAT; + if (cm->ext) return COMP_ARITH_EXT; + int c = COMP_ARITH_O0; + c += 1*cm->order; + c += 2*cm->rle; + c += 4*cm->pack; + return c; + } + + case CRAM_COMP_TOK3: + return cm->level < 10 + ? COMP_TOK3_RANS + : COMP_TOK3_ARITH; + + default: + // Any unspecialised method + return (enum comp_expanded)cm->method; + } +} + +// Short form of cram_block_method_int type +static char comp_method2char[COMP_MAX] = + ".gblr0afn" // standard CRAM methods + "_G" // gzip + "bbbbbbbbB" // bzip2 + "rR" // rans4x8 + "010101014545454582" // ransNx16 + "aAaAaAaAaaa" // arith + "nN"; // tok3 + +// Long form of cram_block_method_int type +static char *comp_method2str[COMP_MAX] = { + // Standard CRAM methods + "raw", "gzip", "bzip2", "lzma", "r4x8", "rNx16", + "arith", "fqzcomp", "tok3", + + // custom gzip + "gzip-min", "gzip-max", + + // custom bzip2 + "bzip2-1", "bzip2-2", "bzip2-3", "bzip2-4", "bzip2-5", + "bzip2-6", "bzip2-7", "bzip2-8", "bzip2-9", + + // rANS 4x8 + "r4x8-o0", "r4x8-o1", + + // rANS 4x16 + "r4x16-o0", "r4x16-o1", + + "r4x16-o0R", "r4x16-o1R", + "r4x16-o0P", "r4x16-o1P", + "r4x16-o0PR", "r4x16-o1PR", + "r32x16-o0", "r32x16-o1", + "r32x16-o0R", "r32x16-o1R", + "r32x16-o0P", "r32x16-o1P", + "r32x16-o0PR","r32x16-o1PR", + "rNx16-xo0", "rNx16-cat", + + // Arith + "arith-o0", "arith-o1", + "arith-o0R", "arith-o1R", + "arith-o0P", "arith-o1P", + "arith-o0PR", "arith-o1PR", + "arith-stripe", "arith-cat", "arith-ext", + + // Name tokeniser + "tok3-rans", "tok3-arith", +}; + 
+/*---------------------------------------------------------------------- + * Manipulation and sorting of Block Content-ID arrays and hashes + */ + +typedef struct { + int64_t csize[COMP_MAX]; + int64_t usize[COMP_MAX]; +} cusize_t; + +static int64_t total_csize(cusize_t *cu) { + int i; + int64_t tot = 0; + for (i = 0; i < COMP_MAX; i++) + tot += cu->csize[i]; + return tot; +} + +static int64_t total_usize(cusize_t *cu) { + int i; + int64_t tot = 0; + for (i = 0; i < COMP_MAX; i++) + tot += cu->usize[i]; + return tot; +} + +// cusize_t array and sorting by compressed size +static cusize_t *sort_cusize_global; // avoids a messy extra data type +static int sort_cusize_compar(const void *i1, const void *i2) { + int64_t n = sort_cusize_global->csize[*(const int *)i2] - + sort_cusize_global->csize[*(const int *)i1]; + return n > 0 ? 1 : (n < 0 ? -1 : *(const int *)i1 - *(const int *)i2); +} + +// Sort a cusize array by size of used method. +// Returns cu->csize[comp] indices in descending size, as static mem +static int *sort_cusize(cusize_t *cu) { + static int idx[COMP_MAX]; + int i; + for (i = 0; i < COMP_MAX; i++) + idx[i] = i; + sort_cusize_global = cu; + qsort(idx, COMP_MAX, sizeof(*idx), sort_cusize_compar); + + return idx; +} + +// Hash table of cusize_t and sorting by key (content-id) +KHASH_MAP_INIT_INT(cu, cusize_t) + +/* Sort by hash key. Global due to rubbish qsort API, but it's simple. 
*/ +static khash_t(cu) *global_cu_hash = NULL; +static int cu_compar(const void *i1, const void *i2) { + return kh_key(global_cu_hash, *(const int *)i1) - + kh_key(global_cu_hash, *(const int *)i2); +} + +/*---------------------------------------------------------------------- + * Main cram_size reporting and aggregation + */ +static off_t report_size(FILE *outfp, int verbose, int ref_seq_blk, + khash_t(cu) *cu_size, cram_cid2ds_t *cid2ds) { + if (!cu_size || !cid2ds) + return -1; + + khiter_t k; + off_t tot_size = 0; + + fprintf(outfp, "# Content_ID Uncomp.size Comp.size Ratio Method%.*s Data_series\n", verbose ? 4 : 0, " "); + int *sorted_blocks = malloc(kh_end(cu_size)*sizeof(int)); + if (!sorted_blocks) + return -1; + int nblocks = 0; + for (k = kh_begin(cu_size); k != kh_end(cu_size); k++) { + if (!kh_exist(cu_size, k)) + continue; + sorted_blocks[nblocks++] = k; + } + global_cu_hash = cu_size; + qsort(sorted_blocks, nblocks, sizeof(int), cu_compar); + + int i; + for (i = 0; i < nblocks; i++) { + k = sorted_blocks[i]; + + if (verbose) { + // FULL output + int *comp_idx = sort_cusize(&kh_value(cu_size, k)); + int first_line = 1, c, j; + for (c = 0; c < COMP_MAX; c++) { + int comp = comp_idx[c]; + if (!kh_value(cu_size, k).csize[comp] && c) + break; + + if (!first_line) + fprintf(outfp, "\n"); + first_line = 0; + + if ((int)kh_key(cu_size, k) < 0) + fprintf(outfp, "BLOCK %8s", "CORE"); + else + fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k)); + + fprintf(outfp, " %12"PRId64" %12"PRId64, + kh_value(cu_size, k).usize[comp], + kh_value(cu_size, k).csize[comp]); + double f = (100.0*(kh_value(cu_size, k).csize[comp]+.0001)) / + (kh_value(cu_size, k).usize[comp]+.0001); + if (f > 999) + fprintf(outfp, " >999%% %-11s", comp_method2str[comp]); + else + fprintf(outfp, " %6.2f%% %-11s",f, comp_method2str[comp]); + + int n, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n); + for (j = 0; j < n; j++) { + int d = dsa[j]; + if (d > 65535) + fprintf(outfp, " %c%c%c", 
d>>16, (d>>8)&0xff, d&0xff); + else + fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff); + } + } + } else { + // aggregate by compression type. + int64_t csize = total_csize(&kh_value(cu_size, k)); + int64_t usize = total_usize(&kh_value(cu_size, k)); + int *comp_idx = sort_cusize(&kh_value(cu_size, k)); + + char cstr[COMP_MAX+1] = {0}; + int cidx = 0, c; + for (c = 0; c < COMP_MAX; c++) { + if (!kh_value(cu_size, k).csize[comp_idx[c]]) + break; + cstr[cidx++] = comp_method2char[comp_idx[c]]; + } + if (!*cstr) *cstr = '.'; + + if ((int)kh_key(cu_size, k) < 0) + fprintf(outfp, "BLOCK %8s", "CORE"); + else + fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k)); + fprintf(outfp, " %12"PRId64" %12"PRId64, usize, csize); + double f = 100*(csize+.0001)/(usize+.0001); + if (f > 999) + fprintf(outfp, " >999%% %-7s", cstr); + else + fprintf(outfp, " %6.2f%% %-7s", f, cstr); + + int n, j, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n); + for (j = 0; j < n; j++) { + int d = dsa[j]; + if (d > 65535) + fprintf(outfp, " %c%c%c", d>>16, (d>>8)&0xff, d&0xff); + else + fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff); + } + } + + if ((int)kh_key(cu_size, k) >= 0 && + (int)kh_key(cu_size, k) == ref_seq_blk) { + fprintf(outfp, " embedded_ref"); + } + fprintf(outfp, "\n"); + + tot_size += total_csize(&kh_value(cu_size, k)); + } + + free(sorted_blocks); + + return tot_size; +} + +/* Main processing loop */ +static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp, + int verbose, int encodings) { + cram_fd *in_c; + cram_container *c = NULL; + cram_block *blk = NULL; + cram_block_slice_hdr *shdr = NULL; + khiter_t k; + int ret; + cram_cid2ds_t *cid2ds = NULL; + khash_t(cu) *cu_size = kh_init(cu); + int ref_seq_blk_used = -1; + int64_t nseqs = 0, nbases = 0, ncont = 0, nslice = 0; + + if (!in->is_cram) { + print_error("cram_size", "Input is not a CRAM file"); + goto err; + } + in_c = in->fp.cram; // low level htslib abuse? 
+ while ((c = cram_read_container(in_c))) { + if (cram_container_is_empty(in_c)) { + cram_block *blk; + // Container compression header + if (!(blk = cram_read_block(in_c))) + goto err; + cram_free_block(blk); + cram_free_container(c); + c = NULL; blk = NULL; + continue; + } + + nseqs += cram_container_get_num_records(c); + nbases += cram_container_get_num_bases(c); + + // Container compression header + int32_t num_slices; + if (!(blk = cram_read_block(in_c))) + goto err; + + // Decode compression header... + cram_block_compression_hdr *chdr; + chdr = cram_decode_compression_header(in_c, blk); + + if (encodings) { + kstring_t ks = KS_INITIALIZE; + if (cram_describe_encodings(chdr, &ks) < 0) + goto err; + + fprintf(outfp, "Container encodings\n%s\n", ks_str(&ks)); + + ks_free(&ks); + } + + cid2ds = cram_update_cid2ds_map(chdr, cid2ds); + + cram_free_block(blk); + blk = NULL; + + cram_free_compression_header(chdr); + + // Container num_blocks can be invalid, due to a bug. + // Instead we iterate in slice context instead. + (void)cram_container_get_landmarks(c, &num_slices); + ncont++; + nslice += num_slices; + + int i, j; + for (i = 0; i < num_slices; i++) { + // Slice header + if (!(blk = cram_read_block(in_c))) + goto err; + if (!(shdr = cram_decode_slice_header(in_c, blk))) + goto err; + cram_free_block(blk); + blk = NULL; + + int ref_seq_blk = cram_slice_hdr_get_embed_ref_id(shdr); + int num_blocks = cram_slice_hdr_get_num_blocks(shdr); + + // Embedded reference. Check it's consistent (if used this is + // an almost guaranteed certainty, so we take the easy route). 
+ if (ref_seq_blk >= 0) { + if (ref_seq_blk_used == -1) + ref_seq_blk_used = ref_seq_blk; + else if (ref_seq_blk_used != ref_seq_blk) + fprintf(samtools_stderr, "Embedded reference is not consistently using the same Content-Id.\n" + "Reported figures for reference will be invalid.\n"); + } + + // Slice data blocks + for (j = 0; j < num_blocks; j++) { + // read and discard, unless it's the ref-ID block + if (!(blk = cram_read_block(in_c))) + goto err; + + int32_t csize = cram_block_get_comp_size(blk); + int32_t usize = cram_block_get_uncomp_size(blk); + int cid = cram_block_get_content_id(blk); + enum cram_block_method method = cram_block_get_method(blk); + + // Expand comp to the internal sub-formats, eg + // rANS order-0/1, PACK+RLE, etc. + cram_method_details *cm; + cm = cram_expand_method(cram_block_get_data(blk), + cram_block_get_comp_size(blk), + method); + if (!cm) + goto err; + enum comp_expanded comp + = comp_method2expanded(cm); + free(cm); + + k = kh_put(cu, cu_size, cid, &ret); + if (ret < 0) + goto err; + if (ret == 0) { + kh_value(cu_size, k).csize[comp] += csize; + kh_value(cu_size, k).usize[comp] += usize; + } else { + memset(&kh_value(cu_size, k), 0, sizeof(cusize_t)); + kh_value(cu_size, k).csize[comp] = csize; + kh_value(cu_size, k).usize[comp] = usize; + } + + cram_free_block(blk); + blk = NULL; + } + cram_free_slice_header(shdr); + shdr = NULL; + } + + cram_free_container(c); + c = NULL; + } + + off_t tot_size = report_size(outfp, verbose, ref_seq_blk_used, + cu_size, cid2ds); + if (tot_size < 0) + goto err; + + kh_destroy(cu, cu_size); + cram_cid2ds_free(cid2ds); + + off_t end = htell(hf_in); + + fprintf(outfp, "\n"); + fprintf(outfp, "Number of containers %18"PRId64"\n", ncont); + fprintf(outfp, "Number of slices %18"PRId64"\n", nslice); + fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs); + fprintf(outfp, "Number of bases %18"PRId64"\n", nbases); + fprintf(outfp, "Total file size %18"PRId64"\n", end); + fprintf(outfp, "Format 
overhead size %18"PRId64"\n", end - tot_size); + + return 0; + + err: + // Report anyway so we can get stats on partial files, but be + // sure to error too. + report_size(outfp, verbose, ref_seq_blk_used, cu_size, cid2ds); + + print_error("cram_size", "Failed in decoding CRAM file"); + if (blk) + cram_free_block(blk); + if (shdr) + cram_free_slice_header(shdr); + if (c) + cram_free_container(c); + if (cid2ds) + cram_cid2ds_free(cid2ds); + + return -1; +} + +/* main() for cram_size */ +int main_cram_size(int argc, char *argv[]) { + int c, usage = 0, verbose = 0, encodings = 0; + sam_hdr_t *h = 0; + hFILE *hf_in = NULL; + samFile *in = NULL; + sam_global_args ga; + FILE *outfp = samtools_stdout; + + static const struct option lopts[] = { + {"output", required_argument, NULL, 'o'}, + {"verbose", no_argument, NULL, 'v'}, + {"encodings", no_argument, NULL, 'e'}, + SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '-'), + { NULL, 0, NULL, 0 } + }; + + sam_global_args_init(&ga); + + while ((c = getopt_long(argc, argv, "vo:e", lopts, NULL)) >= 0) { + switch (c) { + case 'o': + if (!(outfp = fopen(optarg, "w"))) { + perror(optarg); + goto err; + } + break; + + case 'v': + verbose++; + break; + + case 'e': + encodings++; + break; + + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; + /* else fall-through */ + case '?': usage=1; break; + } + } + + if ((optind == argc && isatty(0)) || usage) { + fprintf(samtools_stdout, "Usage: samtools cram_size [-ve] [-o out.size] [in.cram]\n"); + return 0; + } + + char *fn = optind < argc ? argv[optind] : "-"; + + // We want access to in->fp.cram->fp, but this is an opaque struct so we + // can't get that. However we opened with hopen and then reopen as + // CRAM with hts_hopen, which will swallow the initial hFILE and take + // owenership of it. Hence we now know in->fp.cram->fp. 
+ if (!(hf_in = hopen(fn, "r"))) { + print_error_errno("cram_size", "failed to open file '%s'", fn); + return 1; + } + if (!(in = hts_hopen(hf_in, fn, "r"))) { + print_error_errno("cram_size", "failed to open file '%s'", fn); + goto err; + } + + if (!(h = sam_hdr_read(in))) + goto err; + + int ret = cram_size(hf_in, in, h, outfp, verbose, encodings); + sam_hdr_destroy(h); + sam_close(in); + if (outfp != samtools_stdout) + fclose(outfp); + + return ret ? 1 : 0; + + err: + if (in) + sam_close(in); + if (h) + sam_hdr_destroy(h); + + return 1; +} diff --git a/samtools/faidx.c b/samtools/faidx.c index 03b5d65..63204d1 100644 --- a/samtools/faidx.c +++ b/samtools/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- faidx subcommand. - Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -46,7 +46,12 @@ History: #include #include "samtools.h" -#define DEFAULT_FASTA_LINE_LEN 60 +// Negative indicates the same as input data +#define DEFAULT_FASTA_LINE_LEN -60 + +#ifndef ABS +# define ABS(x) ((x)>=0?(x):-(x)) +#endif static unsigned char comp_base[256] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -95,7 +100,7 @@ static void reverse(char *str, const hts_pos_t len) { static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name, - const int ignore, const int length, const hts_pos_t seq_len) { + const int ignore, const hts_pos_t length, const hts_pos_t seq_len) { int id; hts_pos_t beg, end; @@ -110,7 +115,7 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n } else if (seq_len == 0) { fprintf(stderr, "[faidx] Zero length sequence: %s\n", name); } else if (fai_parse_region(faid, name, &id, &beg, &end, 0) - && (end < INT_MAX) && (seq_len != end - beg)) { + && (end < HTS_POS_MAX) && (seq_len != end - beg)) { fprintf(stderr, "[faidx] Truncated sequence: %s\n", name); } @@ 
-131,10 +136,14 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n static int write_output(faidx_t *faid, FILE *file, const char *name, const int ignore, - const int length, const int rev, + const hts_pos_t length, const int rev, const char *pos_strand_name, const char *neg_strand_name, enum fai_format_options format) { - hts_pos_t seq_len; + hts_pos_t seq_len, wrap_len = length; + if (wrap_len < 0) + wrap_len = fai_line_length(faid, name); + if (wrap_len <= 0) + wrap_len = HTS_POS_MAX; char *seq = fai_fetch64(faid, name, &seq_len); if (format == FAI_FASTA) { @@ -147,7 +156,7 @@ static int write_output(faidx_t *faid, FILE *file, const char *name, const int i reverse_complement(seq, seq_len); } - if (write_line(faid, file, seq, name, ignore, length, seq_len) + if (write_line(faid, file, seq, name, ignore, wrap_len, seq_len) == EXIT_FAILURE) { free(seq); return EXIT_FAILURE; @@ -164,7 +173,7 @@ static int write_output(faidx_t *faid, FILE *file, const char *name, const int i reverse(qual, seq_len); } - if (write_line(faid, file, qual, name, ignore, length, seq_len) + if (write_line(faid, file, qual, name, ignore, wrap_len, seq_len) == EXIT_FAILURE) { free(qual); return EXIT_FAILURE; @@ -178,7 +187,7 @@ static int write_output(faidx_t *faid, FILE *file, const char *name, const int i static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, const int ignore, - const int length, const int rev, + const hts_pos_t length, const int rev, const char *pos_strand_name, const char *neg_strand_name, enum fai_format_options format) { @@ -239,7 +248,7 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status) int faidx_core(int argc, char *argv[], enum fai_format_options format) { int c, ignore_error = 0, rev = 0; - int line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */ + hts_pos_t line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */ char* output_file = NULL; /* output file (default is stdout ) */ char 
*region_file = NULL; // list of regions from file, one per line char *pos_strand_name = ""; // Extension to add to name for +ve strand @@ -266,11 +275,11 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) while ((c = getopt_long(argc, argv, "ho:n:cr:fi", lopts, NULL)) >= 0) { switch (c) { case 'o': output_file = optarg; break; - case 'n': line_len = atoi(optarg); - if(line_len<1) { - fprintf(stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,DEFAULT_FASTA_LINE_LEN); - line_len= DEFAULT_FASTA_LINE_LEN ; - } + case 'n': line_len = strtol(optarg, NULL, 10); + if (line_len < 0) { + fprintf(stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,ABS(DEFAULT_FASTA_LINE_LEN)); + line_len= ABS(DEFAULT_FASTA_LINE_LEN); + } break; case 'c': ignore_error = 1; break; case 'r': region_file = optarg; break; diff --git a/samtools/faidx.c.pysam.c b/samtools/faidx.c.pysam.c index 0bc515b..6160661 100644 --- a/samtools/faidx.c.pysam.c +++ b/samtools/faidx.c.pysam.c @@ -2,7 +2,7 @@ /* faidx.c -- faidx subcommand. - Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. 
Author: Heng Li @@ -48,7 +48,12 @@ History: #include #include "samtools.h" -#define DEFAULT_FASTA_LINE_LEN 60 +// Negative indicates the same as input data +#define DEFAULT_FASTA_LINE_LEN -60 + +#ifndef ABS +# define ABS(x) ((x)>=0?(x):-(x)) +#endif static unsigned char comp_base[256] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, @@ -97,7 +102,7 @@ static void reverse(char *str, const hts_pos_t len) { static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name, - const int ignore, const int length, const hts_pos_t seq_len) { + const int ignore, const hts_pos_t length, const hts_pos_t seq_len) { int id; hts_pos_t beg, end; @@ -112,7 +117,7 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n } else if (seq_len == 0) { fprintf(samtools_stderr, "[faidx] Zero length sequence: %s\n", name); } else if (fai_parse_region(faid, name, &id, &beg, &end, 0) - && (end < INT_MAX) && (seq_len != end - beg)) { + && (end < HTS_POS_MAX) && (seq_len != end - beg)) { fprintf(samtools_stderr, "[faidx] Truncated sequence: %s\n", name); } @@ -133,10 +138,14 @@ static int write_line(faidx_t *faid, FILE *file, const char *line, const char *n static int write_output(faidx_t *faid, FILE *file, const char *name, const int ignore, - const int length, const int rev, + const hts_pos_t length, const int rev, const char *pos_strand_name, const char *neg_strand_name, enum fai_format_options format) { - hts_pos_t seq_len; + hts_pos_t seq_len, wrap_len = length; + if (wrap_len < 0) + wrap_len = fai_line_length(faid, name); + if (wrap_len <= 0) + wrap_len = HTS_POS_MAX; char *seq = fai_fetch64(faid, name, &seq_len); if (format == FAI_FASTA) { @@ -149,7 +158,7 @@ static int write_output(faidx_t *faid, FILE *file, const char *name, const int i reverse_complement(seq, seq_len); } - if (write_line(faid, file, seq, name, ignore, length, seq_len) + if (write_line(faid, file, seq, name, ignore, wrap_len, seq_len) == EXIT_FAILURE) { free(seq); 
return EXIT_FAILURE; @@ -166,7 +175,7 @@ static int write_output(faidx_t *faid, FILE *file, const char *name, const int i reverse(qual, seq_len); } - if (write_line(faid, file, qual, name, ignore, length, seq_len) + if (write_line(faid, file, qual, name, ignore, wrap_len, seq_len) == EXIT_FAILURE) { free(qual); return EXIT_FAILURE; @@ -180,7 +189,7 @@ static int write_output(faidx_t *faid, FILE *file, const char *name, const int i static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, const int ignore, - const int length, const int rev, + const hts_pos_t length, const int rev, const char *pos_strand_name, const char *neg_strand_name, enum fai_format_options format) { @@ -241,7 +250,7 @@ static int usage(FILE *fp, enum fai_format_options format, int exit_status) int faidx_core(int argc, char *argv[], enum fai_format_options format) { int c, ignore_error = 0, rev = 0; - int line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */ + hts_pos_t line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */ char* output_file = NULL; /* output file (default is samtools_stdout ) */ char *region_file = NULL; // list of regions from file, one per line char *pos_strand_name = ""; // Extension to add to name for +ve strand @@ -268,11 +277,11 @@ int faidx_core(int argc, char *argv[], enum fai_format_options format) while ((c = getopt_long(argc, argv, "ho:n:cr:fi", lopts, NULL)) >= 0) { switch (c) { case 'o': output_file = optarg; break; - case 'n': line_len = atoi(optarg); - if(line_len<1) { - fprintf(samtools_stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,DEFAULT_FASTA_LINE_LEN); - line_len= DEFAULT_FASTA_LINE_LEN ; - } + case 'n': line_len = strtol(optarg, NULL, 10); + if (line_len < 0) { + fprintf(samtools_stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,ABS(DEFAULT_FASTA_LINE_LEN)); + line_len= ABS(DEFAULT_FASTA_LINE_LEN); + } break; case 'c': ignore_error = 1; break; case 'r': region_file = optarg; break; diff --git 
a/samtools/reset.c b/samtools/reset.c new file mode 100644 index 0000000..f9b0c09 --- /dev/null +++ b/samtools/reset.c @@ -0,0 +1,585 @@ +/* reset.c -- removes aligner updates and reference data from input sam / + bam / cram file and makes read data raw for new processing + + Copyright (C) 2022, 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +#include "samtools.h" +#include "htslib/sam.h" +#include "sam_opts.h" +#include "htslib/thread_pool.h" +#include "htslib/khash.h" +#include "sam_utils.h" +#include + +#define TAGNUM(X) (((X)[0] << 8) | (X)[1]) //to create key for aux tags, like type key in htslib +#define LONG_OPT(X) (128 + (X)) //to handle long and short options with same char + +typedef struct conf_data +{ + int keepRGs; //RG line handling + int noPGentry; //PG line for reset op or not + auxhash_t aux_keep; //SET that holds the aux tags to be retained + auxhash_t aux_remove; //SET that holds the aux tags to be removed + char *pgid; //PG id onwards which to be removed +} conf_data; + +/// usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped +returns nothing +*/ +static void usage(FILE *fp) +{ + fprintf(fp, "Usage: samtools reset [options]\n\ + -o FILE Output file\n\ + -x, --remove-tag STR\n\ + Aux tags to be removed\n\ + --keep-tag STR\n\ + Aux tags to be retained. 
Equivalent to -x ^STR\n\ + --reject-PG ID\n\ + Removes PG line with ID matching to input and succeeding PG lines\n\ + --no-RG To have RG lines or not\n\ + --no-PG To have PG entry or not for reset operation\n"); + + sam_global_opt_help(fp, "--O--@--"); + return; +} + +/// removeauxtags - remove aux tags in bam data which are not present in acceptable tag set +/** @param bamdata - pointer to the bamdata from which needs the filtering + * @param config - pointer to conf_data +returns nothing +*/ +void removeauxtags(bam1_t *bamdata, conf_data *config) +{ + uint8_t *auxdata = NULL; + const char *tag = NULL, rg[] = "RG"; + khint_t iter = 0; + int ret = 0; + + if (!bamdata || !config || (!config->aux_keep && !config->aux_remove && config->keepRGs)) + return; + + //remove RG tags from bamdata if keepRG is false + if (!config->keepRGs) { + if (!config->aux_keep && !config->aux_remove) { + //none of aux tag filter in use, create remove filter + config->aux_remove = kh_init(aux_exists); + } + + if (config->aux_keep) { + //keep set in use, remove RG if present + iter = kh_get(aux_exists, config->aux_keep, TAGNUM(rg)); + if (iter != kh_end(config->aux_keep)) { + kh_del(aux_exists, config->aux_keep, iter); + } + } + if (config->aux_remove) { + //remove set in use, add RG if not present + iter = kh_get(aux_exists, config->aux_remove, TAGNUM(rg)); + if (iter == kh_end(config->aux_remove)) { + kh_put(aux_exists, config->aux_remove, TAGNUM(rg), &ret); + } + } + } + + for (auxdata = bam_aux_first(bamdata); auxdata; ) { + tag = bam_aux_tag(auxdata); + if (config->aux_keep) { //keep option or remove option with ^ in use + iter = kh_get(aux_exists, config->aux_keep, TAGNUM(tag)); + if (iter == kh_end(config->aux_keep)) { //not present in keep, remove + auxdata = bam_aux_remove(bamdata, auxdata); + } + else { //present, keep + auxdata = bam_aux_next(bamdata, auxdata); + } + } + else if (config->aux_remove) { //remove option in use + iter = kh_get(aux_exists, config->aux_remove, 
TAGNUM(tag)); + if (iter != kh_end(config->aux_remove)) { //present in remove, remove + auxdata = bam_aux_remove(bamdata, auxdata); + } + else { //not present, keep + auxdata = bam_aux_next(bamdata, auxdata); + } + } + //else impossible + } +} + +/// getRGlines - add RG lines from input header to output header +/** @param in_samhdr - pointer to input sam header data + * @param out_samhdr - pointer to output sam header data +returns 1 on failure 0 on success +*/ +int getRGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr) +{ + kstring_t line = KS_INITIALIZE; + int i = 0, ret = 0, count = 0; + const char rg[] = "RG"; + + if (!in_samhdr || !out_samhdr) { + fprintf(stderr, "Invalid parameters in getRGlines!\n"); + return 1; + } + + if (-1 == (count = sam_hdr_count_lines(in_samhdr, rg))) { + fprintf(stderr, "Failed to get RG count!\n"); + return 1; + } + + for (i = 0; i < count; ++i) + { + ks_clear(&line); + if (sam_hdr_find_line_pos(in_samhdr, rg, i, &line)) { + fprintf(stderr, "Failed to get RG data!\n"); + ret = 1; + break; + } + if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) { + fprintf(stderr, "Failed to add RG data!\n"); + ret = 1; + break; + } + } + ks_free(&line); + + return ret; +} + +/// getPGlines - add PG lines from input header to output header based on user option +/** @param in_samhdr - pointer to input sam header data + * @param out_samhdr - pointer to output sam header data + * @param config - pointer to internal configuration data + * @param argdump - string containing dump of command line invocation +returns 1 on failure 0 on success +*/ +int getPGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr, conf_data *config, const char *argdump) +{ + kstring_t line = KS_INITIALIZE, id = KS_INITIALIZE; + int i = 0, ret = 0, count = 0; + const char pg[] = "PG"; + + if (!in_samhdr || !out_samhdr || !config) { + fprintf(stderr, "Invalid parameters in getPGlines!\n"); + return 1; + } + + if (-1 == (count = sam_hdr_count_lines(in_samhdr, pg))) { + 
fprintf(stderr, "Failed to get PG count!\n"); + return 1; + } + + if (config->pgid && config->pgid[0]) { //when reject-PG is given, and is not empty, remove given pg onwards + for (i = 0; i < count; ++i) { + if (sam_hdr_find_tag_pos(in_samhdr, pg, i, "ID", &id)) { + fprintf(stderr, "Failed to get PG entry fields for line %d!\n", i + 1); + break; + } + + if (!strcmp(id.s, config->pgid)) + break; + + //either current PG is prior to rejected one or all PGs are in, get PG line and add + ks_clear(&line); + if (sam_hdr_find_line_pos(in_samhdr, "PG", i, &line)) { + fprintf(stderr, "Failed to get PG data at %d!\n", i + 1); + ret = 1; + break; + } + + //add to output + if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) { + fprintf(stderr, "Failed to add PG data!\n"); + ret = 1; + break; + } + } + } + else { //keep all + for (i = 0; i < count; ++i) { + if (sam_hdr_find_line_pos(in_samhdr, "PG", i, &line)) { + fprintf(stderr, "Failed to get PG data at %d!\n", i + 1); + ret = 1; + break; + } + //line has the required PG data + if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) { + fprintf(stderr, "Failed to add PG data!\n"); + ret = 1; + break; + } + } + } + + if (!ret && !config->noPGentry) { + //add PG entry with reset command + if (-1 == (ret = sam_hdr_add_pg(out_samhdr, "samtools", "CL", argdump, NULL))) { + fprintf(stderr, "Failed to set PG entry!\n"); + } + } + ks_free(&line); + ks_free(&id); + + return ret; +} + +/// reset - do the reset of data and create output; create output header with required rg/pg data, add bamdata with flags set to unmapped, pair info and orientation reset, +// reerse and complement alignment if required +/** @param infile - input samfile pointer + * @param outfile - output sam file pointer + * @param config - pointer to internal configuration data + * @param args - string containing dump of command line invocation +returns 1 on failure 0 on success +*/ +int reset(samFile *infile, samFile *outfile, conf_data *config, char *args) +{ + sam_hdr_t 
*in_samhdr = NULL, *out_samhdr = NULL; + int ret = EXIT_FAILURE, ret_r = 0, ret_w = 0, i = 0; + bam1_t *bamdata = NULL, *outdata = NULL; + kstring_t querydata = KS_INITIALIZE, qualdata = KS_INITIALIZE; + char *sp = NULL, *qp = NULL; + uint8_t *bamquery = NULL, *bamqual = NULL; + + if (!infile || !outfile) { + fprintf(stderr, "Invalid parameters in reset!\n"); + goto error; + } + + //read input header + in_samhdr = sam_hdr_read(infile); + if (!in_samhdr) + { + fprintf(stderr, "Failed to read header from file!\n"); + goto error; + } + //create output header + if (!(out_samhdr = sam_hdr_init())) + { + fprintf(stderr, "Failed to create output header!\n"); + goto error; + } + + //add version to output header + if (-1 == sam_hdr_add_line(out_samhdr,"HD", "VN", SAM_FORMAT_VERSION, NULL)) { + fprintf(stderr, "Failed to set header data!\n"); + goto error; + } + //add RG / PG lines if configured + if ((config->keepRGs && getRGlines(in_samhdr, out_samhdr)) || + getPGlines(in_samhdr, out_samhdr, config, args)) { + goto error; + } + + //write output header + if (sam_hdr_write(outfile, out_samhdr)) { + print_error_errno("reset", "Output header write failed (%d)!\n", errno); + goto error; + } + + bamdata = bam_init1(); //input bam + outdata = bam_init1(); //output bam + if (!bamdata || !outdata) + { + fprintf(stderr, "Failed to allocate data memory!\n"); + goto error; + } + + errno = 0; i = 0; + sp = NULL; qp = NULL; + bamquery = NULL; bamqual = NULL; + + //get bam data, make updates and dump to output + while (0 <= (ret_r = sam_read1(infile, in_samhdr, bamdata))) + { + sp = NULL; qp = NULL; + bamquery = NULL; bamqual = NULL; + + // read data + if (bamdata->core.flag & BAM_FSECONDARY || bamdata->core.flag & BAM_FSUPPLEMENTARY) { + continue; + } + + //update flags + uint16_t flags = bamdata->core.flag & ~BAM_FPROPER_PAIR; //reset pair info + flags |= BAM_FUNMAP; //mark as unmapped + if (bamdata->core.flag & BAM_FPAIRED) { + flags |= BAM_FMUNMAP; //mark mate as unmapped, if it was 
a pair + } + flags &= ~BAM_FMREVERSE; //reset mate orientation + + if (0 > ks_resize(&querydata, bamdata->core.l_qseq) || + 0 > ks_resize(&qualdata, bamdata->core.l_qseq)) { + fprintf(stderr, "Failed to get allocate memory!\n"); + ret_r = -4; + break; + } + ks_clear(&querydata); + ks_clear(&qualdata); + + sp = ks_str(&querydata); + qp = ks_str(&qualdata); + bamquery = bam_get_seq(bamdata); + bamqual = bam_get_qual(bamdata); + if (bamdata->core.flag & BAM_FREVERSE) { + //sequence data ordered as reverse complemented, reorder/complement sequence and quality data as read and clear the flag + for (i = bamdata->core.l_qseq - 1; i >= 0; --i) { + *sp++ = "=TGKCYSBAWRDMHVN"[bam_seqi(bamquery, i)]; + *qp++ = bamqual[i]; + } + flags &= ~BAM_FREVERSE; //reset flag as well + } + else { + //data in read order itself + for (i = 0; i < bamdata->core.l_qseq ; ++i) { + *sp++ = seq_nt16_str[bam_seqi(bamquery, i)]; + } + memcpy(qp, bam_get_qual(bamdata), bamdata->core.l_qseq); + } + + removeauxtags(bamdata, config); + if (0 > (ret_w = bam_set1(outdata, bamdata->core.l_qname - bamdata->core.l_extranul - 1, bam_get_qname(bamdata), flags, -1, -1, 0, 0, NULL, -1, -1, 0, bamdata->core.l_qseq, querydata.s, qualdata.s, bam_get_l_aux(bamdata)))) { + print_error_errno("reset", "Failed to set output data (%d)!\n", errno); + break; + } + + memcpy(bam_get_aux(outdata), bam_get_aux(bamdata), bam_get_l_aux(bamdata)); + outdata->l_data += bam_get_l_aux(bamdata); + + errno = 0; + //write bam data to output + if (0 > (ret_w = sam_write1(outfile, out_samhdr, outdata))) + { + print_error_errno("reset", "Failed to write output data (%d)!\n", errno); + break; + } + // wrote the data, continue read/write cycle + errno = 0; + } + + if (-1 > ret_r || 0 > ret_w) { + //some error + fprintf(stderr, "Error during %s!\n", (-1 > ret_r)? "read" : "write"); + } + else { + // no error! 
+ ret = EXIT_SUCCESS; + } + +error: + // clean up and return result + if (in_samhdr) + sam_hdr_destroy(in_samhdr); + if (out_samhdr) + sam_hdr_destroy(out_samhdr); + + if (bamdata) + bam_destroy1(bamdata); + if (outdata) + bam_destroy1(outdata); + + if (qualdata.s) + ks_free(&qualdata); + if (querydata.s) + ks_free(&querydata); + return ret; +} + +/// cleanup - free up allocations made +/** @param config - pointer to internal configuration data +returns nothing +*/ +void cleanup(conf_data *config) +{ + if (config->aux_keep) { + kh_destroy(aux_exists, config->aux_keep); + config->aux_keep = NULL; + } + if (config->aux_remove) { + kh_destroy(aux_exists, config->aux_remove); + config->aux_remove = NULL; + } +} + +/// main_reset - starts the reset of data +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main_reset(int argc, char *argv[]) +{ + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', '-', 'O', '-', '-', '@'), //let output format and thread count be given by user - long options + {"keep-tag", required_argument, NULL, LONG_OPT('x')}, //aux tags to be retained, supports ^ STR + {"remove-tag", required_argument, NULL, 'x'}, //aux tags to be removed + {"no-RG", no_argument, NULL, 1}, //no RG lines in output, default is to keep them + //reject PG lines from input, default is to keep them (i.e. 
option not given); without optional filename, all PGs removed and those given in file are filtered when optional filename is given + {"reject-PG", required_argument, NULL, 'p'}, //reject entries from this PG onwards + {"no-PG", no_argument, NULL, 2}, //do not add PG entry for reset operation, default is to add it + {NULL, 0, NULL, 0} + }; + samFile *infile = NULL, *outfile = NULL; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool tpool = {NULL, 0}; + const char *inname = NULL, *outname = NULL; + int c = 0, ret = EXIT_FAILURE; + char outmode[4] = "w", *args = NULL; + conf_data resetconf = {1, 0, NULL, NULL, NULL}; //keep RGs and PGs by default + + + //samtools reset -o outfile -x/--remove-tag ... --keep-tag ... --threads=n --output-fmt=fmt --no-RG --reject-PG pgid --no-PG [] + while ((c = getopt_long(argc, argv, "o:@:x:O:", lopts, NULL)) >= 0) + { + switch (c) + { + case 1: //--no-RG + if (!resetconf.keepRGs) { + usage(stderr); //already given! + goto exit; + } + resetconf.keepRGs = 0; + break; + case 2: //--no-PG + if (resetconf.noPGentry) { + usage(stderr); //already given! + goto exit; + } + resetconf.noPGentry = 1; + break; + case 'p': //--reject-PG= + if (resetconf.pgid) { + usage(stderr); //already given! + goto exit; + } + resetconf.pgid = optarg; + break; + case 'o': //output file name + if (outname) { //already given! + usage(stderr); + goto exit; + + } + outname = optarg; + break; + case 'x': //remove aux tag + if (*optarg == '^') { //remove all except given ones! + if (parse_aux_list(&resetconf.aux_keep, optarg+1, "main_reset")) { + usage(stderr); + goto exit; + } + } + else { //remove given ones + if (parse_aux_list(&resetconf.aux_remove, optarg, "main_reset")) { + usage(stderr); + goto exit; + } + } + break; + case LONG_OPT('x'): //keep aux tags + if (parse_aux_list(&resetconf.aux_keep, optarg, "main_reset")) { + usage(stderr); + goto exit; + } + break; + // handle standard samtool options like thread count, verbosity... 
+ default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) + break; + // else fall-through + // couldn't parse or unknown options, show usage! + case '?': //unknown options found! + usage(stderr); + goto exit; + break; + } + } + + if (argc == 1 && isatty(STDIN_FILENO)) { + //no args and input is stdin -- it is the usage check + usage(stdout); + ret = EXIT_SUCCESS; + goto exit; + } + //else have other args or input from redirection/pipe/other device -- validate and work + + if (!outname) + outname = "-"; + + //check and fail if unnecessary parameters are given + c = argc - optind; + if (c > 1) { + usage(stderr); + goto exit; + } + + if (c == 1) { + inname = argv[optind]; + } + else { + inname = "-"; + } + + //set output file format based on name + sam_open_mode(outmode + 1, outname, NULL); + + //open input and output files + infile = sam_open(inname, "r"); + outfile = sam_open_format(outname, outmode, &ga.out); + if (!infile || !outfile) { + fprintf(stderr, "Could not open %s%s%s\n", !infile ? inname : "", (!infile && !outfile)? ", " : "", !outfile ? outname : ""); + goto exit; + } + + // set the thread count if given as argument + if (ga.nthreads > 0) + { + if (!(tpool.pool = hts_tpool_init(ga.nthreads))) + { + fprintf(stderr, "\nFailed to setup thread pool\n"); + goto exit; + } + + hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool); + hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool); + } + + args = stringify_argv(argc + 1, argv - 1); //to dump invocation in PG line + + //do the reset! 
+ ret = reset(infile, outfile, &resetconf, args); + +exit: + if (args) + free(args); + if (infile) + sam_close(infile); + if (outfile) + sam_close(outfile); + if (tpool.pool) + hts_tpool_destroy(tpool.pool); + cleanup(&resetconf); + sam_global_args_free(&ga); + + return ret; +} diff --git a/samtools/reset.c.pysam.c b/samtools/reset.c.pysam.c new file mode 100644 index 0000000..fdf44b9 --- /dev/null +++ b/samtools/reset.c.pysam.c @@ -0,0 +1,587 @@ +#include "samtools.pysam.h" + +/* reset.c -- removes aligner updates and reference data from input sam / + bam / cram file and makes read data raw for new processing + + Copyright (C) 2022, 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +#include "samtools.h" +#include "htslib/sam.h" +#include "sam_opts.h" +#include "htslib/thread_pool.h" +#include "htslib/khash.h" +#include "sam_utils.h" +#include + +#define TAGNUM(X) (((X)[0] << 8) | (X)[1]) //to create key for aux tags, like type key in htslib +#define LONG_OPT(X) (128 + (X)) //to handle long and short options with same char + +typedef struct conf_data +{ + int keepRGs; //RG line handling + int noPGentry; //PG line for reset op or not + auxhash_t aux_keep; //SET that holds the aux tags to be retained + auxhash_t aux_remove; //SET that holds the aux tags to be removed + char *pgid; //PG id onwards which to be removed +} conf_data; + +/// usage - print the usage +/** @param fp pointer to the file / terminal to which usage to be dumped +returns nothing +*/ +static void usage(FILE *fp) +{ + fprintf(fp, "Usage: samtools reset [options]\n\ + -o FILE Output file\n\ + -x, --remove-tag STR\n\ + Aux tags to be removed\n\ + --keep-tag STR\n\ + Aux tags to be retained. 
Equivalent to -x ^STR\n\ + --reject-PG ID\n\ + Removes PG line with ID matching to input and succeeding PG lines\n\ + --no-RG To have RG lines or not\n\ + --no-PG To have PG entry or not for reset operation\n"); + + sam_global_opt_help(fp, "--O--@--"); + return; +} + +/// removeauxtags - remove aux tags in bam data which are not present in acceptable tag set +/** @param bamdata - pointer to the bamdata from which needs the filtering + * @param config - pointer to conf_data +returns nothing +*/ +void removeauxtags(bam1_t *bamdata, conf_data *config) +{ + uint8_t *auxdata = NULL; + const char *tag = NULL, rg[] = "RG"; + khint_t iter = 0; + int ret = 0; + + if (!bamdata || !config || (!config->aux_keep && !config->aux_remove && config->keepRGs)) + return; + + //remove RG tags from bamdata if keepRG is false + if (!config->keepRGs) { + if (!config->aux_keep && !config->aux_remove) { + //none of aux tag filter in use, create remove filter + config->aux_remove = kh_init(aux_exists); + } + + if (config->aux_keep) { + //keep set in use, remove RG if present + iter = kh_get(aux_exists, config->aux_keep, TAGNUM(rg)); + if (iter != kh_end(config->aux_keep)) { + kh_del(aux_exists, config->aux_keep, iter); + } + } + if (config->aux_remove) { + //remove set in use, add RG if not present + iter = kh_get(aux_exists, config->aux_remove, TAGNUM(rg)); + if (iter == kh_end(config->aux_remove)) { + kh_put(aux_exists, config->aux_remove, TAGNUM(rg), &ret); + } + } + } + + for (auxdata = bam_aux_first(bamdata); auxdata; ) { + tag = bam_aux_tag(auxdata); + if (config->aux_keep) { //keep option or remove option with ^ in use + iter = kh_get(aux_exists, config->aux_keep, TAGNUM(tag)); + if (iter == kh_end(config->aux_keep)) { //not present in keep, remove + auxdata = bam_aux_remove(bamdata, auxdata); + } + else { //present, keep + auxdata = bam_aux_next(bamdata, auxdata); + } + } + else if (config->aux_remove) { //remove option in use + iter = kh_get(aux_exists, config->aux_remove, 
TAGNUM(tag)); + if (iter != kh_end(config->aux_remove)) { //present in remove, remove + auxdata = bam_aux_remove(bamdata, auxdata); + } + else { //not present, keep + auxdata = bam_aux_next(bamdata, auxdata); + } + } + //else impossible + } +} + +/// getRGlines - add RG lines from input header to output header +/** @param in_samhdr - pointer to input sam header data + * @param out_samhdr - pointer to output sam header data +returns 1 on failure 0 on success +*/ +int getRGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr) +{ + kstring_t line = KS_INITIALIZE; + int i = 0, ret = 0, count = 0; + const char rg[] = "RG"; + + if (!in_samhdr || !out_samhdr) { + fprintf(samtools_stderr, "Invalid parameters in getRGlines!\n"); + return 1; + } + + if (-1 == (count = sam_hdr_count_lines(in_samhdr, rg))) { + fprintf(samtools_stderr, "Failed to get RG count!\n"); + return 1; + } + + for (i = 0; i < count; ++i) + { + ks_clear(&line); + if (sam_hdr_find_line_pos(in_samhdr, rg, i, &line)) { + fprintf(samtools_stderr, "Failed to get RG data!\n"); + ret = 1; + break; + } + if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) { + fprintf(samtools_stderr, "Failed to add RG data!\n"); + ret = 1; + break; + } + } + ks_free(&line); + + return ret; +} + +/// getPGlines - add PG lines from input header to output header based on user option +/** @param in_samhdr - pointer to input sam header data + * @param out_samhdr - pointer to output sam header data + * @param config - pointer to internal configuration data + * @param argdump - string containing dump of command line invocation +returns 1 on failure 0 on success +*/ +int getPGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr, conf_data *config, const char *argdump) +{ + kstring_t line = KS_INITIALIZE, id = KS_INITIALIZE; + int i = 0, ret = 0, count = 0; + const char pg[] = "PG"; + + if (!in_samhdr || !out_samhdr || !config) { + fprintf(samtools_stderr, "Invalid parameters in getPGlines!\n"); + return 1; + } + + if (-1 == (count = 
sam_hdr_count_lines(in_samhdr, pg))) { + fprintf(samtools_stderr, "Failed to get PG count!\n"); + return 1; + } + + if (config->pgid && config->pgid[0]) { //when reject-PG is given, and is not empty, remove given pg onwards + for (i = 0; i < count; ++i) { + if (sam_hdr_find_tag_pos(in_samhdr, pg, i, "ID", &id)) { + fprintf(samtools_stderr, "Failed to get PG entry fields for line %d!\n", i + 1); + break; + } + + if (!strcmp(id.s, config->pgid)) + break; + + //either current PG is prior to rejected one or all PGs are in, get PG line and add + ks_clear(&line); + if (sam_hdr_find_line_pos(in_samhdr, "PG", i, &line)) { + fprintf(samtools_stderr, "Failed to get PG data at %d!\n", i + 1); + ret = 1; + break; + } + + //add to output + if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) { + fprintf(samtools_stderr, "Failed to add PG data!\n"); + ret = 1; + break; + } + } + } + else { //keep all + for (i = 0; i < count; ++i) { + if (sam_hdr_find_line_pos(in_samhdr, "PG", i, &line)) { + fprintf(samtools_stderr, "Failed to get PG data at %d!\n", i + 1); + ret = 1; + break; + } + //line has the required PG data + if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) { + fprintf(samtools_stderr, "Failed to add PG data!\n"); + ret = 1; + break; + } + } + } + + if (!ret && !config->noPGentry) { + //add PG entry with reset command + if (-1 == (ret = sam_hdr_add_pg(out_samhdr, "samtools", "CL", argdump, NULL))) { + fprintf(samtools_stderr, "Failed to set PG entry!\n"); + } + } + ks_free(&line); + ks_free(&id); + + return ret; +} + +/// reset - do the reset of data and create output; create output header with required rg/pg data, add bamdata with flags set to unmapped, pair info and orientation reset, +// reerse and complement alignment if required +/** @param infile - input samfile pointer + * @param outfile - output sam file pointer + * @param config - pointer to internal configuration data + * @param args - string containing dump of command line invocation +returns 1 on failure 0 on 
success +*/ +int reset(samFile *infile, samFile *outfile, conf_data *config, char *args) +{ + sam_hdr_t *in_samhdr = NULL, *out_samhdr = NULL; + int ret = EXIT_FAILURE, ret_r = 0, ret_w = 0, i = 0; + bam1_t *bamdata = NULL, *outdata = NULL; + kstring_t querydata = KS_INITIALIZE, qualdata = KS_INITIALIZE; + char *sp = NULL, *qp = NULL; + uint8_t *bamquery = NULL, *bamqual = NULL; + + if (!infile || !outfile) { + fprintf(samtools_stderr, "Invalid parameters in reset!\n"); + goto error; + } + + //read input header + in_samhdr = sam_hdr_read(infile); + if (!in_samhdr) + { + fprintf(samtools_stderr, "Failed to read header from file!\n"); + goto error; + } + //create output header + if (!(out_samhdr = sam_hdr_init())) + { + fprintf(samtools_stderr, "Failed to create output header!\n"); + goto error; + } + + //add version to output header + if (-1 == sam_hdr_add_line(out_samhdr,"HD", "VN", SAM_FORMAT_VERSION, NULL)) { + fprintf(samtools_stderr, "Failed to set header data!\n"); + goto error; + } + //add RG / PG lines if configured + if ((config->keepRGs && getRGlines(in_samhdr, out_samhdr)) || + getPGlines(in_samhdr, out_samhdr, config, args)) { + goto error; + } + + //write output header + if (sam_hdr_write(outfile, out_samhdr)) { + print_error_errno("reset", "Output header write failed (%d)!\n", errno); + goto error; + } + + bamdata = bam_init1(); //input bam + outdata = bam_init1(); //output bam + if (!bamdata || !outdata) + { + fprintf(samtools_stderr, "Failed to allocate data memory!\n"); + goto error; + } + + errno = 0; i = 0; + sp = NULL; qp = NULL; + bamquery = NULL; bamqual = NULL; + + //get bam data, make updates and dump to output + while (0 <= (ret_r = sam_read1(infile, in_samhdr, bamdata))) + { + sp = NULL; qp = NULL; + bamquery = NULL; bamqual = NULL; + + // read data + if (bamdata->core.flag & BAM_FSECONDARY || bamdata->core.flag & BAM_FSUPPLEMENTARY) { + continue; + } + + //update flags + uint16_t flags = bamdata->core.flag & ~BAM_FPROPER_PAIR; //reset pair 
info + flags |= BAM_FUNMAP; //mark as unmapped + if (bamdata->core.flag & BAM_FPAIRED) { + flags |= BAM_FMUNMAP; //mark mate as unmapped, if it was a pair + } + flags &= ~BAM_FMREVERSE; //reset mate orientation + + if (0 > ks_resize(&querydata, bamdata->core.l_qseq) || + 0 > ks_resize(&qualdata, bamdata->core.l_qseq)) { + fprintf(samtools_stderr, "Failed to get allocate memory!\n"); + ret_r = -4; + break; + } + ks_clear(&querydata); + ks_clear(&qualdata); + + sp = ks_str(&querydata); + qp = ks_str(&qualdata); + bamquery = bam_get_seq(bamdata); + bamqual = bam_get_qual(bamdata); + if (bamdata->core.flag & BAM_FREVERSE) { + //sequence data ordered as reverse complemented, reorder/complement sequence and quality data as read and clear the flag + for (i = bamdata->core.l_qseq - 1; i >= 0; --i) { + *sp++ = "=TGKCYSBAWRDMHVN"[bam_seqi(bamquery, i)]; + *qp++ = bamqual[i]; + } + flags &= ~BAM_FREVERSE; //reset flag as well + } + else { + //data in read order itself + for (i = 0; i < bamdata->core.l_qseq ; ++i) { + *sp++ = seq_nt16_str[bam_seqi(bamquery, i)]; + } + memcpy(qp, bam_get_qual(bamdata), bamdata->core.l_qseq); + } + + removeauxtags(bamdata, config); + if (0 > (ret_w = bam_set1(outdata, bamdata->core.l_qname - bamdata->core.l_extranul - 1, bam_get_qname(bamdata), flags, -1, -1, 0, 0, NULL, -1, -1, 0, bamdata->core.l_qseq, querydata.s, qualdata.s, bam_get_l_aux(bamdata)))) { + print_error_errno("reset", "Failed to set output data (%d)!\n", errno); + break; + } + + memcpy(bam_get_aux(outdata), bam_get_aux(bamdata), bam_get_l_aux(bamdata)); + outdata->l_data += bam_get_l_aux(bamdata); + + errno = 0; + //write bam data to output + if (0 > (ret_w = sam_write1(outfile, out_samhdr, outdata))) + { + print_error_errno("reset", "Failed to write output data (%d)!\n", errno); + break; + } + // wrote the data, continue read/write cycle + errno = 0; + } + + if (-1 > ret_r || 0 > ret_w) { + //some error + fprintf(samtools_stderr, "Error during %s!\n", (-1 > ret_r)? 
"read" : "write"); + } + else { + // no error! + ret = EXIT_SUCCESS; + } + +error: + // clean up and return result + if (in_samhdr) + sam_hdr_destroy(in_samhdr); + if (out_samhdr) + sam_hdr_destroy(out_samhdr); + + if (bamdata) + bam_destroy1(bamdata); + if (outdata) + bam_destroy1(outdata); + + if (qualdata.s) + ks_free(&qualdata); + if (querydata.s) + ks_free(&querydata); + return ret; +} + +/// cleanup - free up allocations made +/** @param config - pointer to internal configuration data +returns nothing +*/ +void cleanup(conf_data *config) +{ + if (config->aux_keep) { + kh_destroy(aux_exists, config->aux_keep); + config->aux_keep = NULL; + } + if (config->aux_remove) { + kh_destroy(aux_exists, config->aux_remove); + config->aux_remove = NULL; + } +} + +/// main_reset - starts the reset of data +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main_reset(int argc, char *argv[]) +{ + static const struct option lopts[] = { + SAM_OPT_GLOBAL_OPTIONS('-', '-', 'O', '-', '-', '@'), //let output format and thread count be given by user - long options + {"keep-tag", required_argument, NULL, LONG_OPT('x')}, //aux tags to be retained, supports ^ STR + {"remove-tag", required_argument, NULL, 'x'}, //aux tags to be removed + {"no-RG", no_argument, NULL, 1}, //no RG lines in output, default is to keep them + //reject PG lines from input, default is to keep them (i.e. 
option not given); without optional filename, all PGs removed and those given in file are filtered when optional filename is given + {"reject-PG", required_argument, NULL, 'p'}, //reject entries from this PG onwards + {"no-PG", no_argument, NULL, 2}, //do not add PG entry for reset operation, default is to add it + {NULL, 0, NULL, 0} + }; + samFile *infile = NULL, *outfile = NULL; + sam_global_args ga = SAM_GLOBAL_ARGS_INIT; + htsThreadPool tpool = {NULL, 0}; + const char *inname = NULL, *outname = NULL; + int c = 0, ret = EXIT_FAILURE; + char outmode[4] = "w", *args = NULL; + conf_data resetconf = {1, 0, NULL, NULL, NULL}; //keep RGs and PGs by default + + + //samtools reset -o outfile -x/--remove-tag ... --keep-tag ... --threads=n --output-fmt=fmt --no-RG --reject-PG pgid --no-PG [] + while ((c = getopt_long(argc, argv, "o:@:x:O:", lopts, NULL)) >= 0) + { + switch (c) + { + case 1: //--no-RG + if (!resetconf.keepRGs) { + usage(samtools_stderr); //already given! + goto exit; + } + resetconf.keepRGs = 0; + break; + case 2: //--no-PG + if (resetconf.noPGentry) { + usage(samtools_stderr); //already given! + goto exit; + } + resetconf.noPGentry = 1; + break; + case 'p': //--reject-PG= + if (resetconf.pgid) { + usage(samtools_stderr); //already given! + goto exit; + } + resetconf.pgid = optarg; + break; + case 'o': //output file name + if (outname) { //already given! + usage(samtools_stderr); + goto exit; + + } + outname = optarg; + break; + case 'x': //remove aux tag + if (*optarg == '^') { //remove all except given ones! 
+ if (parse_aux_list(&resetconf.aux_keep, optarg+1, "main_reset")) { + usage(samtools_stderr); + goto exit; + } + } + else { //remove given ones + if (parse_aux_list(&resetconf.aux_remove, optarg, "main_reset")) { + usage(samtools_stderr); + goto exit; + } + } + break; + case LONG_OPT('x'): //keep aux tags + if (parse_aux_list(&resetconf.aux_keep, optarg, "main_reset")) { + usage(samtools_stderr); + goto exit; + } + break; + // handle standard samtool options like thread count, verbosity... + default: + if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) + break; + // else fall-through + // couldn't parse or unknown options, show usage! + case '?': //unknown options found! + usage(samtools_stderr); + goto exit; + break; + } + } + + if (argc == 1 && isatty(STDIN_FILENO)) { + //no args and input is stdin -- it is the usage check + usage(samtools_stdout); + ret = EXIT_SUCCESS; + goto exit; + } + //else have other args or input from redirection/pipe/other device -- validate and work + + if (!outname) + outname = "-"; + + //check and fail if unnecessary parameters are given + c = argc - optind; + if (c > 1) { + usage(samtools_stderr); + goto exit; + } + + if (c == 1) { + inname = argv[optind]; + } + else { + inname = "-"; + } + + //set output file format based on name + sam_open_mode(outmode + 1, outname, NULL); + + //open input and output files + infile = sam_open(inname, "r"); + outfile = sam_open_format(outname, outmode, &ga.out); + if (!infile || !outfile) { + fprintf(samtools_stderr, "Could not open %s%s%s\n", !infile ? inname : "", (!infile && !outfile)? ", " : "", !outfile ? 
outname : ""); + goto exit; + } + + // set the thread count if given as argument + if (ga.nthreads > 0) + { + if (!(tpool.pool = hts_tpool_init(ga.nthreads))) + { + fprintf(samtools_stderr, "\nFailed to setup thread pool\n"); + goto exit; + } + + hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool); + hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool); + } + + args = stringify_argv(argc + 1, argv - 1); //to dump invocation in PG line + + //do the reset! + ret = reset(infile, outfile, &resetconf, args); + +exit: + if (args) + free(args); + if (infile) + sam_close(infile); + if (outfile) + sam_close(outfile); + if (tpool.pool) + hts_tpool_destroy(tpool.pool); + cleanup(&resetconf); + sam_global_args_free(&ga); + + return ret; +} diff --git a/samtools/sam_utils.c b/samtools/sam_utils.c index f105687..d7178b2 100644 --- a/samtools/sam_utils.c +++ b/samtools/sam_utils.c @@ -1,6 +1,6 @@ /* sam_utils.c -- various utilities internal to samtools. - Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. + Copyright (C) 2014-2016, 2018, 2019, 2023 Genome Research Ltd. Author: John Marshall @@ -30,7 +30,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include -#include "samtools.h" +#include "sam_utils.h" static htsFile *samtools_stdout = NULL; @@ -149,3 +149,44 @@ char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header) { return fn_idx; } + + +/// parse_aux_list - parses given string for aux tags which are ',' separated +/** @param h - pointer to a SET holding aux tags + * @param optarg - string having the ',' separated aux tags + * @param msgheader - string to be used during error output as a header +returns -1 on failure and 0 on success +moved from sam_view.c to here for common usage at different source files +*/ +int parse_aux_list(auxhash_t *h, char *optarg, const char *msgheader) +{ + if (!*h) + *h = kh_init(aux_exists); + + while (strlen(optarg) >= 2) { + int x = optarg[0]<<8 | optarg[1]; + int ret = 0; + kh_put(aux_exists, *h, x, &ret); + if (ret < 0) { + kh_destroy(aux_exists, *h); + *h = NULL; + return -1; + } + + optarg += 2; + if (*optarg == ',') // allow white-space too for easy `cat file`? + optarg++; + else if (*optarg != 0) + break; + } + + if (strlen(optarg) != 0) { + fprintf(stderr, "%s: Error parsing option, " + "auxiliary tags should be exactly two characters long.\n", msgheader ? msgheader : ""); + kh_destroy(aux_exists, *h); + *h = NULL; + return -1; + } + + return 0; +} diff --git a/samtools/sam_utils.c.pysam.c b/samtools/sam_utils.c.pysam.c index a5f08a9..304dd38 100644 --- a/samtools/sam_utils.c.pysam.c +++ b/samtools/sam_utils.c.pysam.c @@ -2,7 +2,7 @@ /* sam_utils.c -- various utilities internal to samtools. - Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd. + Copyright (C) 2014-2016, 2018, 2019, 2023 Genome Research Ltd. Author: John Marshall @@ -32,7 +32,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include -#include "samtools.h" +#include "sam_utils.h" static htsFile *samtools_stdout_internal = NULL; @@ -151,3 +151,44 @@ char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header) { return fn_idx; } + + +/// parse_aux_list - parses given string for aux tags which are ',' separated +/** @param h - pointer to a SET holding aux tags + * @param optarg - string having the ',' separated aux tags + * @param msgheader - string to be used during error output as a header +returns -1 on failure and 0 on success +moved from sam_view.c to here for common usage at different source files +*/ +int parse_aux_list(auxhash_t *h, char *optarg, const char *msgheader) +{ + if (!*h) + *h = kh_init(aux_exists); + + while (strlen(optarg) >= 2) { + int x = optarg[0]<<8 | optarg[1]; + int ret = 0; + kh_put(aux_exists, *h, x, &ret); + if (ret < 0) { + kh_destroy(aux_exists, *h); + *h = NULL; + return -1; + } + + optarg += 2; + if (*optarg == ',') // allow white-space too for easy `cat file`? + optarg++; + else if (*optarg != 0) + break; + } + + if (strlen(optarg) != 0) { + fprintf(samtools_stderr, "%s: Error parsing option, " + "auxiliary tags should be exactly two characters long.\n", msgheader ? msgheader : ""); + kh_destroy(aux_exists, *h); + *h = NULL; + return -1; + } + + return 0; +} diff --git a/samtools/sam_utils.h b/samtools/sam_utils.h new file mode 100644 index 0000000..061c2a2 --- /dev/null +++ b/samtools/sam_utils.h @@ -0,0 +1,83 @@ +/* sam_utils.c -- to hold utility functions and types + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#ifndef SAM_UTIL_H +#define SAM_UTIL_H + +#include "htslib/khash.h" +#include "htslib/sam.h" + +//this file may contain any utility functions and data types to be shared across + +/*below parse_aux_list and aux_exists are moved from sam_view.c to here for common + *usage at different source files + */ + +KHASH_SET_INIT_INT(aux_exists) //SET data type to hold aux tags +typedef khash_t(aux_exists) *auxhash_t; + +/// parse_aux_list - parses given string for aux tags which are ',' separated +/** @param h - pointer to a SET holding aux tags + * @param optarg - string having the ',' separated aux tags + * @param msgheader - string to be used during error output as a header +returns -1 on failure and 0 on success +moved from sam_view.c to here for common usage at different source files +*/ +int parse_aux_list(auxhash_t *h, char *optarg, const char *msgheader); + + +// below utility function declarations moved from samtools.h to here and this header is included in samtools.h + +#define CHECK_PRINTF(fmt,args) HTS_FORMAT(HTS_PRINTF_FMT, (fmt), (args)) + +void print_error(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3); +void print_error_errno(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3); + +void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp); + +/* Utility functions to register an output htsFile/samFile/vcfFile that + * might be stdout. If FNAME is "-" or NULL, records FP so that print_error() + * et al can automatically flush it before printing an error message. + */ +void autoflush_if_stdout(htsFile *fp, const char *fname); + +/* Call this before closing FP; check_sam_close() does this automatically. + */ +void release_autoflush(htsFile *fp); + +/* + * Utility function to add an index to a file we've opened for write. + * NB: Call this after writing the header and before writing sequences. 
+ * + * The returned index filename should be freed by the caller, but only + * after sam_idx_save has been called. + * + * Returns index filename on success, + * NULL on failure. + */ +char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header); + +#endif //SAM_UTIL_H + + diff --git a/samtools/sam_view.c b/samtools/sam_view.c index d60d8f7..d23e965 100644 --- a/samtools/sam_view.c +++ b/samtools/sam_view.c @@ -1,6 +1,6 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2022 Genome Research Ltd. + Copyright (C) 2009-2023 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -43,13 +43,11 @@ DEALINGS IN THE SOFTWARE. */ #include "sam_opts.h" #include "bam.h" // for bam_get_library and bam_remove_B #include "bedidx.h" +#include "sam_utils.h" KHASH_SET_INIT_STR(str) typedef khash_t(str) *strhash_t; -KHASH_SET_INIT_INT(aux_exists) -typedef khash_t(aux_exists) *auxhash_t; - // This structure contains the settings for a samview run typedef struct samview_settings { strhash_t rghash; @@ -89,6 +87,8 @@ typedef struct samview_settings { char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx; int fetch_pairs, nreglist; hts_reglist_t *reglist; + int sanitize; + int count_rf; // CRAM_OPT_REQUIRED_FIELDS for view -c } samview_settings_t; // Copied from htslib/sam.c. @@ -392,33 +392,6 @@ static inline void change_flag(bam1_t *b, samview_settings_t *settings) b->core.flag &= ~settings->remove_flag; } -int parse_aux_list(auxhash_t *h, char *optarg) { - if (!*h) - *h = kh_init(aux_exists); - - while (strlen(optarg) >= 2) { - int x = optarg[0]<<8 | optarg[1]; - int ret = 0; - kh_put(aux_exists, *h, x, &ret); - if (ret < 0) - return -1; - - optarg += 2; - if (*optarg == ',') // allow white-space too for easy `cat file`? 
- optarg++; - else if (*optarg != 0) - break; - } - - if (strlen(optarg) != 0) { - fprintf(stderr, "main_samview: Error parsing option, " - "auxiliary tags should be exactly two characters long.\n"); - return -1; - } - - return 0; -} - static int cmp_reglist_intervals(const void *aptr, const void *bptr) { hts_pair_pos_t *a = (hts_pair_pos_t*)aptr; @@ -692,6 +665,10 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * // Common code for processing and writing a record static inline int process_one_record(samview_settings_t *conf, bam1_t *b, int *write_error) { + if (conf->sanitize) + if (bam_sanitize(conf->header, b, conf->sanitize) < 0) + return -1; + if (!process_aln(conf->header, b, conf)) { if (!conf->is_count) { change_flag(b, conf); @@ -784,6 +761,13 @@ static int is_sam(const char *fn) { return (l >= 4 && strcasecmp(fn + l-4, ".sam") == 0); } +static void aux_list_free(samview_settings_t *settings) { + if (settings->keep_tag) + kh_destroy(aux_exists, settings->keep_tag); + if (settings->remove_tag) + kh_destroy(aux_exists, settings->remove_tag); +} + int main_samview(int argc, char *argv[]) { samview_settings_t settings; @@ -797,6 +781,7 @@ int main_samview(int argc, char *argv[]) memset(&settings,0,sizeof(settings)); settings.subsam_frac = -1.0; + settings.count_rf = SAM_FLAG; // don't want 0, and this is quick static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), @@ -851,6 +836,7 @@ int main_samview(int argc, char *argv[]) {"unoutput", required_argument, NULL, 'U'}, {"use-index", no_argument, NULL, 'M'}, {"with-header", no_argument, NULL, 'h'}, + {"sanitize", required_argument, NULL, 'z'}, }; /* parse command-line options */ @@ -867,7 +853,7 @@ int main_samview(int argc, char *argv[]) char *tmp; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pP", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pPz:", lopts, NULL)) >= 0) { switch (c) { case 
's': @@ -883,6 +869,7 @@ int main_samview(int argc, char *argv[]) print_error("view", "Incorrect sampling argument \"%s\"", optarg); goto view_end; } + settings.count_rf |= SAM_QNAME; break; case LONGOPT('s'): settings.subsam_frac = strtod(optarg, &tmp); @@ -890,9 +877,13 @@ int main_samview(int argc, char *argv[]) print_error("view", "Incorrect sampling argument \"%s\"", optarg); goto view_end; } + settings.count_rf |= SAM_QNAME; break; case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break; - case 'm': settings.min_qlen = atoi(optarg); break; + case 'm': + settings.min_qlen = atoi(optarg); + settings.count_rf |= SAM_SEQ; + break; case 'c': settings.is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; @@ -904,17 +895,40 @@ int main_samview(int argc, char *argv[]) case 'o': settings.fn_out = strdup(optarg); break; case 'U': settings.fn_un_out = strdup(optarg); break; case 'X': has_index_file = 1; break; - case 'f': settings.flag_on |= bam_str2flag(optarg); break; - case 'F': settings.flag_off |= bam_str2flag(optarg); break; + case 'f': + settings.flag_on |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; + case 'F': + settings.flag_off |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; case LONGOPT('g'): - settings.flag_anyon |= bam_str2flag(optarg); break; - case 'G': settings.flag_alloff |= bam_str2flag(optarg); break; - case 'q': settings.min_mapQ = atoi(optarg); break; + settings.flag_anyon |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; + case 'G': + settings.flag_alloff |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; + case 'q': + settings.min_mapQ = atoi(optarg); + settings.count_rf |= SAM_MAPQ; + break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; - case 'l': settings.library = strdup(optarg); break; + case 'l': + settings.library = strdup(optarg); + settings.count_rf |= SAM_RGAUX; + 
break; case 'p': settings.unmap = 1; break; case 'P': settings.fetch_pairs = 1; settings.multi_region = 1; break; + case 'z': + if ((settings.sanitize = bam_sanitize_options(optarg)) < 0) { + ret = 1; + goto view_end; + } + break; case LONGOPT('L'): settings.multi_region = 1; // fall through @@ -924,28 +938,33 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR; break; case 'r': if (add_read_group_single("view", &settings, optarg) != 0) { ret = 1; goto view_end; } + settings.count_rf |= SAM_RGAUX; break; case 'R': if (add_read_groups_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } + settings.count_rf |= SAM_RGAUX; break; case 'N': if (add_read_names_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } + settings.count_rf |= SAM_QNAME; break; + case 'd': if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) { - print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); + print_error("view", "Invalid \"tag:value\" option: \"%s\"", optarg); ret = 1; goto view_end; } @@ -970,13 +989,22 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + // Some tag filtering affects other fields + if (memcmp(settings.tag, "NM", 2) == 0 || + memcmp(settings.tag, "MD", 2) == 0) + settings.count_rf |= SAM_AUX | SAM_SEQ; + else if (memcmp(settings.tag, "RG", 2) == 0) + settings.count_rf |= SAM_RGAUX; + else + settings.count_rf |= SAM_AUX; break; + case 'D': // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX // path translation as described at: // http://www.mingw.org/wiki/Posix_path_conversion if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { - print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); + print_error("view", "Invalid \"tag:file\" option: \"%s\"", optarg); ret = 1; goto view_end; } @@ -1000,7 +1028,16 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + // 
Some tag filtering affects other fields + if (memcmp(settings.tag, "NM", 2) == 0 || + memcmp(settings.tag, "MD", 2) == 0) + settings.count_rf |= SAM_AUX | SAM_SEQ; + else if (memcmp(settings.tag, "RG", 2) == 0) + settings.count_rf |= SAM_RGAUX; + else + settings.count_rf |= SAM_AUX; break; + case LONGOPT('?'): return usage(stdout, EXIT_SUCCESS, 1); case '?': @@ -1029,23 +1066,30 @@ int main_samview(int argc, char *argv[]) print_error("main_samview", "Couldn't initialise filter"); return 1; } + settings.count_rf = INT_MAX; // no way to know what we need break; case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break; case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break; case 'x': if (*optarg == '^') { - if (parse_aux_list(&settings.keep_tag, optarg+1)) + if (parse_aux_list(&settings.keep_tag, optarg+1, "main_samview")) { + aux_list_free(&settings); return usage(stderr, EXIT_FAILURE, 0); + } } else { - if (parse_aux_list(&settings.remove_tag, optarg)) + if (parse_aux_list(&settings.remove_tag, optarg, "main_samview")) { + aux_list_free(&settings); return usage(stderr, EXIT_FAILURE, 0); + } } break; case LONGOPT('x'): - if (parse_aux_list(&settings.keep_tag, optarg)) + if (parse_aux_list(&settings.keep_tag, optarg, "main_samview")) { + aux_list_free(&settings); return usage(stderr, EXIT_FAILURE, 0); + } break; default: @@ -1221,7 +1265,7 @@ int main_samview(int argc, char *argv[]) settings.unmap = 0; // Not valid in counting mode } - if (ga.nthreads > 1) { + if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { fprintf(stderr, "Error creating thread pool\n"); ret = 1; @@ -1236,13 +1280,20 @@ int main_samview(int argc, char *argv[]) // Initialize BAM/CRAM index char **regs = NULL; int nregs = 0; - if ( has_index_file && optind < argc - 2 ) regs = &argv[optind+2], nregs = argc - optind - 2, settings.fn_idx_in = argv[optind+1]; - else if ( !has_index_file && optind < argc - 1 ) regs = &argv[optind+1], nregs = argc - optind - 
1; - else if ( has_index_file ) - { + if ( has_index_file && optind <= argc - 2 ) { + regs = optind < argc-2 ? &argv[optind+2] : NULL; + nregs = argc - optind - 2; + settings.fn_idx_in = argv[optind+1]; + } else if (!has_index_file && optind < argc - 1 ) { + regs = &argv[optind+1]; + nregs = argc - optind - 1; + } else if ( has_index_file && argc-optind < 2) { print_error("view", "Incorrect number of arguments for -X option. Aborting."); return 1; } + if (regs) + settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR; + if ( settings.fn_idx_in || nregs || settings.multi_region ) { settings.hts_idx = settings.fn_idx_in ? sam_index_load2(settings.in, settings.fn_in, settings.fn_idx_in) : sam_index_load(settings.in, settings.fn_in); @@ -1253,6 +1304,10 @@ int main_samview(int argc, char *argv[]) } } + if (settings.is_count) + // Won't fail, but also wouldn't matter if it did + hts_set_opt(settings.in, CRAM_OPT_REQUIRED_FIELDS, settings.count_rf); + if ( settings.fetch_pairs ) { hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs); @@ -1265,8 +1320,8 @@ int main_samview(int argc, char *argv[]) ret = iter ? 
multi_region_view(&settings, iter) : 1; if (ret) goto view_end; } - else if ( !settings.hts_idx ) // stream through the entire file - { + else if ( !settings.hts_idx || optind+1 >= argc-has_index_file ) { + // stream through the entire file ret = stream_view(&settings); if (ret) goto view_end; } else { // retrieve alignments in specified regions @@ -1352,10 +1407,7 @@ view_end: free(settings.fn_un_out_idx); free(arg_list); - if (settings.keep_tag) - kh_destroy(aux_exists, settings.keep_tag); - if (settings.remove_tag) - kh_destroy(aux_exists, settings.remove_tag); + aux_list_free(&settings); return ret; } @@ -1417,6 +1469,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " Comma-separated read tags to preserve (repeatable) [null].\n" " Equivalent to \"-x ^STR\"\n" " -B, --remove-B Collapse the backward CIGAR operation\n" +" -z, --sanitize FLAGS Perform sanitity checking and fixing on records.\n" +" FLAGS is comma separated (see manual). [off]\n" "\n" "General options:\n" " -?, --help Print long help, including note about region specification\n" diff --git a/samtools/sam_view.c.pysam.c b/samtools/sam_view.c.pysam.c index bb61059..7961862 100644 --- a/samtools/sam_view.c.pysam.c +++ b/samtools/sam_view.c.pysam.c @@ -2,7 +2,7 @@ /* sam_view.c -- SAM<->BAM<->CRAM conversion. - Copyright (C) 2009-2022 Genome Research Ltd. + Copyright (C) 2009-2023 Genome Research Ltd. Portions copyright (C) 2009, 2011, 2012 Broad Institute. Author: Heng Li @@ -45,13 +45,11 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "sam_opts.h" #include "bam.h" // for bam_get_library and bam_remove_B #include "bedidx.h" +#include "sam_utils.h" KHASH_SET_INIT_STR(str) typedef khash_t(str) *strhash_t; -KHASH_SET_INIT_INT(aux_exists) -typedef khash_t(aux_exists) *auxhash_t; - // This structure contains the settings for a samview run typedef struct samview_settings { strhash_t rghash; @@ -91,6 +89,8 @@ typedef struct samview_settings { char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx; int fetch_pairs, nreglist; hts_reglist_t *reglist; + int sanitize; + int count_rf; // CRAM_OPT_REQUIRED_FIELDS for view -c } samview_settings_t; // Copied from htslib/sam.c. @@ -394,33 +394,6 @@ static inline void change_flag(bam1_t *b, samview_settings_t *settings) b->core.flag &= ~settings->remove_flag; } -int parse_aux_list(auxhash_t *h, char *optarg) { - if (!*h) - *h = kh_init(aux_exists); - - while (strlen(optarg) >= 2) { - int x = optarg[0]<<8 | optarg[1]; - int ret = 0; - kh_put(aux_exists, *h, x, &ret); - if (ret < 0) - return -1; - - optarg += 2; - if (*optarg == ',') // allow white-space too for easy `cat file`? 
- optarg++; - else if (*optarg != 0) - break; - } - - if (strlen(optarg) != 0) { - fprintf(samtools_stderr, "main_samview: Error parsing option, " - "auxiliary tags should be exactly two characters long.\n"); - return -1; - } - - return 0; -} - static int cmp_reglist_intervals(const void *aptr, const void *bptr) { hts_pair_pos_t *a = (hts_pair_pos_t*)aptr; @@ -694,6 +667,10 @@ static int fetch_pairs_collect_mates(samview_settings_t *conf, hts_itr_multi_t * // Common code for processing and writing a record static inline int process_one_record(samview_settings_t *conf, bam1_t *b, int *write_error) { + if (conf->sanitize) + if (bam_sanitize(conf->header, b, conf->sanitize) < 0) + return -1; + if (!process_aln(conf->header, b, conf)) { if (!conf->is_count) { change_flag(b, conf); @@ -786,6 +763,13 @@ static int is_sam(const char *fn) { return (l >= 4 && strcasecmp(fn + l-4, ".sam") == 0); } +static void aux_list_free(samview_settings_t *settings) { + if (settings->keep_tag) + kh_destroy(aux_exists, settings->keep_tag); + if (settings->remove_tag) + kh_destroy(aux_exists, settings->remove_tag); +} + int main_samview(int argc, char *argv[]) { samview_settings_t settings; @@ -799,6 +783,7 @@ int main_samview(int argc, char *argv[]) memset(&settings,0,sizeof(settings)); settings.subsam_frac = -1.0; + settings.count_rf = SAM_FLAG; // don't want 0, and this is quick static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'), @@ -853,6 +838,7 @@ int main_samview(int argc, char *argv[]) {"unoutput", required_argument, NULL, 'U'}, {"use-index", no_argument, NULL, 'M'}, {"with-header", no_argument, NULL, 'h'}, + {"sanitize", required_argument, NULL, 'z'}, }; /* parse command-line options */ @@ -869,7 +855,7 @@ int main_samview(int argc, char *argv[]) char *tmp; while ((c = getopt_long(argc, argv, - "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pP", + "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pPz:", lopts, NULL)) >= 0) { switch 
(c) { case 's': @@ -885,6 +871,7 @@ int main_samview(int argc, char *argv[]) print_error("view", "Incorrect sampling argument \"%s\"", optarg); goto view_end; } + settings.count_rf |= SAM_QNAME; break; case LONGOPT('s'): settings.subsam_frac = strtod(optarg, &tmp); @@ -892,9 +879,13 @@ int main_samview(int argc, char *argv[]) print_error("view", "Incorrect sampling argument \"%s\"", optarg); goto view_end; } + settings.count_rf |= SAM_QNAME; break; case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break; - case 'm': settings.min_qlen = atoi(optarg); break; + case 'm': + settings.min_qlen = atoi(optarg); + settings.count_rf |= SAM_SEQ; + break; case 'c': settings.is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; @@ -906,17 +897,40 @@ int main_samview(int argc, char *argv[]) case 'o': settings.fn_out = strdup(optarg); break; case 'U': settings.fn_un_out = strdup(optarg); break; case 'X': has_index_file = 1; break; - case 'f': settings.flag_on |= bam_str2flag(optarg); break; - case 'F': settings.flag_off |= bam_str2flag(optarg); break; + case 'f': + settings.flag_on |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; + case 'F': + settings.flag_off |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; case LONGOPT('g'): - settings.flag_anyon |= bam_str2flag(optarg); break; - case 'G': settings.flag_alloff |= bam_str2flag(optarg); break; - case 'q': settings.min_mapQ = atoi(optarg); break; + settings.flag_anyon |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; + case 'G': + settings.flag_alloff |= bam_str2flag(optarg); + settings.count_rf |= SAM_FLAG | SAM_RNEXT; + break; + case 'q': + settings.min_mapQ = atoi(optarg); + settings.count_rf |= SAM_MAPQ; + break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; - case 'l': settings.library = strdup(optarg); break; + case 'l': + settings.library = strdup(optarg); + settings.count_rf |= 
SAM_RGAUX; + break; case 'p': settings.unmap = 1; break; case 'P': settings.fetch_pairs = 1; settings.multi_region = 1; break; + case 'z': + if ((settings.sanitize = bam_sanitize_options(optarg)) < 0) { + ret = 1; + goto view_end; + } + break; case LONGOPT('L'): settings.multi_region = 1; // fall through @@ -926,28 +940,33 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR; break; case 'r': if (add_read_group_single("view", &settings, optarg) != 0) { ret = 1; goto view_end; } + settings.count_rf |= SAM_RGAUX; break; case 'R': if (add_read_groups_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } + settings.count_rf |= SAM_RGAUX; break; case 'N': if (add_read_names_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } + settings.count_rf |= SAM_QNAME; break; + case 'd': if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) { - print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg); + print_error("view", "Invalid \"tag:value\" option: \"%s\"", optarg); ret = 1; goto view_end; } @@ -972,13 +991,22 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + // Some tag filtering affects other fields + if (memcmp(settings.tag, "NM", 2) == 0 || + memcmp(settings.tag, "MD", 2) == 0) + settings.count_rf |= SAM_AUX | SAM_SEQ; + else if (memcmp(settings.tag, "RG", 2) == 0) + settings.count_rf |= SAM_RGAUX; + else + settings.count_rf |= SAM_AUX; break; + case 'D': // Allow ";" as delimiter besides ":" to support MinGW CLI POSIX // path translation as described at: // http://www.mingw.org/wiki/Posix_path_conversion if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) { - print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg); + print_error("view", "Invalid \"tag:file\" option: \"%s\"", optarg); ret = 1; goto view_end; } @@ -1002,7 +1030,16 @@ int main_samview(int argc, char *argv[]) ret = 1; goto 
view_end; } + // Some tag filtering affects other fields + if (memcmp(settings.tag, "NM", 2) == 0 || + memcmp(settings.tag, "MD", 2) == 0) + settings.count_rf |= SAM_AUX | SAM_SEQ; + else if (memcmp(settings.tag, "RG", 2) == 0) + settings.count_rf |= SAM_RGAUX; + else + settings.count_rf |= SAM_AUX; break; + case LONGOPT('?'): return usage(samtools_stdout, EXIT_SUCCESS, 1); case '?': @@ -1031,23 +1068,30 @@ int main_samview(int argc, char *argv[]) print_error("main_samview", "Couldn't initialise filter"); return 1; } + settings.count_rf = INT_MAX; // no way to know what we need break; case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break; case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break; case 'x': if (*optarg == '^') { - if (parse_aux_list(&settings.keep_tag, optarg+1)) + if (parse_aux_list(&settings.keep_tag, optarg+1, "main_samview")) { + aux_list_free(&settings); return usage(samtools_stderr, EXIT_FAILURE, 0); + } } else { - if (parse_aux_list(&settings.remove_tag, optarg)) + if (parse_aux_list(&settings.remove_tag, optarg, "main_samview")) { + aux_list_free(&settings); return usage(samtools_stderr, EXIT_FAILURE, 0); + } } break; case LONGOPT('x'): - if (parse_aux_list(&settings.keep_tag, optarg)) + if (parse_aux_list(&settings.keep_tag, optarg, "main_samview")) { + aux_list_free(&settings); return usage(samtools_stderr, EXIT_FAILURE, 0); + } break; default: @@ -1223,7 +1267,7 @@ int main_samview(int argc, char *argv[]) settings.unmap = 0; // Not valid in counting mode } - if (ga.nthreads > 1) { + if (ga.nthreads > 0) { if (!(p.pool = hts_tpool_init(ga.nthreads))) { fprintf(samtools_stderr, "Error creating thread pool\n"); ret = 1; @@ -1238,13 +1282,20 @@ int main_samview(int argc, char *argv[]) // Initialize BAM/CRAM index char **regs = NULL; int nregs = 0; - if ( has_index_file && optind < argc - 2 ) regs = &argv[optind+2], nregs = argc - optind - 2, settings.fn_idx_in = argv[optind+1]; - else if ( !has_index_file && optind 
< argc - 1 ) regs = &argv[optind+1], nregs = argc - optind - 1; - else if ( has_index_file ) - { + if ( has_index_file && optind <= argc - 2 ) { + regs = optind < argc-2 ? &argv[optind+2] : NULL; + nregs = argc - optind - 2; + settings.fn_idx_in = argv[optind+1]; + } else if (!has_index_file && optind < argc - 1 ) { + regs = &argv[optind+1]; + nregs = argc - optind - 1; + } else if ( has_index_file && argc-optind < 2) { print_error("view", "Incorrect number of arguments for -X option. Aborting."); return 1; } + if (regs) + settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR; + if ( settings.fn_idx_in || nregs || settings.multi_region ) { settings.hts_idx = settings.fn_idx_in ? sam_index_load2(settings.in, settings.fn_in, settings.fn_idx_in) : sam_index_load(settings.in, settings.fn_in); @@ -1255,6 +1306,10 @@ int main_samview(int argc, char *argv[]) } } + if (settings.is_count) + // Won't fail, but also wouldn't matter if it did + hts_set_opt(settings.in, CRAM_OPT_REQUIRED_FIELDS, settings.count_rf); + if ( settings.fetch_pairs ) { hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs); @@ -1267,8 +1322,8 @@ int main_samview(int argc, char *argv[]) ret = iter ? 
multi_region_view(&settings, iter) : 1; if (ret) goto view_end; } - else if ( !settings.hts_idx ) // stream through the entire file - { + else if ( !settings.hts_idx || optind+1 >= argc-has_index_file ) { + // stream through the entire file ret = stream_view(&settings); if (ret) goto view_end; } else { // retrieve alignments in specified regions @@ -1354,10 +1409,7 @@ view_end: free(settings.fn_un_out_idx); free(arg_list); - if (settings.keep_tag) - kh_destroy(aux_exists, settings.keep_tag); - if (settings.remove_tag) - kh_destroy(aux_exists, settings.remove_tag); + aux_list_free(&settings); return ret; } @@ -1419,6 +1471,8 @@ static int usage(FILE *fp, int exit_status, int is_long_help) " Comma-separated read tags to preserve (repeatable) [null].\n" " Equivalent to \"-x ^STR\"\n" " -B, --remove-B Collapse the backward CIGAR operation\n" +" -z, --sanitize FLAGS Perform sanitity checking and fixing on records.\n" +" FLAGS is comma separated (see manual). [off]\n" "\n" "General options:\n" " -?, --help Print long help, including note about region specification\n" diff --git a/samtools/samtools.h b/samtools/samtools.h index e0f99c2..a244c66 100644 --- a/samtools/samtools.h +++ b/samtools/samtools.h @@ -1,6 +1,6 @@ /* samtools.h -- utility routines. - Copyright (C) 2013-2015, 2019 Genome Research Ltd. + Copyright (C) 2013-2015, 2019, 2023 Genome Research Ltd. Author: Petr Danecek @@ -27,36 +27,28 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts_defs.h" #include "htslib/sam.h" +#include "sam_utils.h" const char *samtools_version(void); -#define CHECK_PRINTF(fmt,args) HTS_FORMAT(HTS_PRINTF_FMT, (fmt), (args)) - -void print_error(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3); -void print_error_errno(const char *subcommand, const char *format, ...) 
CHECK_PRINTF(2, 3); - -void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp); - -/* Utility functions to register an output htsFile/samFile/vcfFile that - * might be stdout. If FNAME is "-" or NULL, records FP so that print_error() - * et al can automatically flush it before printing an error message. - */ -void autoflush_if_stdout(htsFile *fp, const char *fname); - -/* Call this before closing FP; check_sam_close() does this automatically. - */ -void release_autoflush(htsFile *fp); - -/* - * Utility function to add an index to a file we've opened for write. - * NB: Call this after writing the header and before writing sequences. - * - * The returned index filename should be freed by the caller, but only - * after sam_idx_save has been called. - * - * Returns index filename on success, - * NULL on failure. - */ -char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header); +/* BAM sanitizer options */ +#define FIX_POS 2 +#define FIX_MQUAL 4 +#define FIX_UNMAP 8 +#define FIX_CIGAR 16 +#define FIX_AUX 32 + +// default for position sorted data +#define FIX_ON (FIX_MQUAL|FIX_UNMAP|FIX_CIGAR|FIX_AUX) +#define FIX_ALL 255 + +// Parses a comma-separated list of "pos", "mqual", "unmap", "cigar", and "aux" +// keywords for the bam sanitizer. +int bam_sanitize_options(const char *str); + +// Sanitize a BAM record, using FIX_* bit flags as defined above. +// Returns 0 on success, +// <0 on failure. +int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags); #endif diff --git a/samtools/stats.c b/samtools/stats.c index 55a6465..06802b1 100644 --- a/samtools/stats.c +++ b/samtools/stats.c @@ -1,6 +1,6 @@ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. 
Author: Petr Danecek Author: Sam Nicholls @@ -180,6 +180,7 @@ typedef struct uint64_t *insertions, *deletions; uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; isize_t *isize; + uint64_t* mapping_qualities; // The extremes encountered int max_len; // Maximum read length @@ -537,11 +538,24 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) } } -void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos) +void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos, hts_pos_t end) { int i; hts_pos_t fai_ref_len; - char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); + char *fai_ref; + + if (end < pos+stats->mrseq_buf-1) + end = pos+stats->mrseq_buf-1; + else if (stats->mrseq_buf < end - pos) { + size_t sz = end - pos; + uint8_t *new_rseq = realloc(stats->rseq_buf, sz); + if (!new_rseq) + error("Couldn't expand the reference sequence buffer\n"); + stats->rseq_buf = new_rseq; + stats->mrseq_buf = sz; + } + + fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid)); uint8_t *ptr = stats->rseq_buf; @@ -1198,6 +1212,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair stats->max_len_1st = read_len; if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len ) stats->max_len_2nd = read_len; + if ( ( bam_line->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FSUPPLEMENTARY|BAM_FQCFAIL|BAM_FDUP) ) == 0 ) + stats->mapping_qualities[bam_line->core.qual]++; int i; int gc_count = 0; @@ -1328,16 +1344,19 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair // 20kbp, so the effect is negligible. 
if ( stats->info->fai ) { - int inc_ref = 0, inc_gcd = 0; - // First pass or new chromosome - if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; } - // Read goes beyond the end of the rseq buffer - else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; } + hts_pos_t inc_ref = 0; + int inc_gcd = 0; + // First pass or new chromosome, or read goes beyond the rseq buffer + if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid + || stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen) { + inc_ref=bam_line->core.pos+readlen; + inc_gcd=1; + } // Read overlaps the next gcd bin else if ( stats->gcd_pos+stats->info->gcd_bin_size < bam_line->core.pos+readlen ) { inc_gcd = 1; - if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = 1; + if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = bam_line->core.pos+stats->info->gcd_bin_size; } if ( inc_gcd ) { @@ -1345,7 +1364,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair if ( stats->igcd >= stats->ngcd ) realloc_gcd_buffer(stats, readlen); if ( inc_ref ) - read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos); + read_ref_seq(stats, bam_line->core.tid, + bam_line->core.pos, inc_ref); stats->gcd_pos = bam_line->core.pos; stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->info->gcd_bin_size); } @@ -1460,7 +1480,7 @@ float gcd_percentile(gc_depth_t *gcd, int N, int p) void output_stats(FILE *to, stats_t *stats, int sparse) { // Calculate average insert size and standard deviation (from the main bulk data only) - int isize, ibulk=0, icov; + int isize, ibulk=0, icov, imapq=0; uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0, cov_sum=0; double bulk=0, avg_isize=0, sd_isize=0; for (isize=0; isizeisize->nitems(stats->isize->data); isize++) @@ -1768,6 +1788,13 @@ void 
output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "LRL\t%d\t%ld\n", ilen+1, (long)stats->read_lengths_2nd[ilen+1]); } + fprintf(to, "# Mapping qualities for reads !(UNMAP|SECOND|SUPPL|QCFAIL|DUP). Use `grep ^MAPQ | cut -f 2-` to extract this part. The columns are: mapq, count\n"); + for (imapq=0; imapq < 256; imapq++) + { + if ( stats->mapping_qualities[imapq]>0 ) + fprintf(to, "MAPQ\t%d\t%ld\n", imapq, (long)stats->mapping_qualities[imapq]); + } + fprintf(to, "# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); for (ilen=0; ilennindels; ilen++) @@ -2128,6 +2155,7 @@ void cleanup_stats(stats_t* stats) destroy_regions(stats); if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash); free(stats->split_name); + free(stats->mapping_qualities); free(stats); } @@ -2317,6 +2345,8 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr if (!stats->del_cycles_1st) goto nomem; stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); if (!stats->del_cycles_2nd) goto nomem; + stats->mapping_qualities = calloc(256,sizeof(uint64_t)); + if(!stats->mapping_qualities) goto nomem; if (init_barcode_tags(stats) < 0) goto nomem; realloc_rseq_buffer(stats); diff --git a/samtools/stats.c.pysam.c b/samtools/stats.c.pysam.c index 5158827..c333195 100644 --- a/samtools/stats.c.pysam.c +++ b/samtools/stats.c.pysam.c @@ -2,7 +2,7 @@ /* stats.c -- This is the former bamcheck integrated into samtools/htslib. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. 
Author: Petr Danecek Author: Sam Nicholls @@ -182,6 +182,7 @@ typedef struct uint64_t *insertions, *deletions; uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; isize_t *isize; + uint64_t* mapping_qualities; // The extremes encountered int max_len; // Maximum read length @@ -539,11 +540,24 @@ void count_mismatches_per_cycle(stats_t *stats, bam1_t *bam_line, int read_len) } } -void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos) +void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos, hts_pos_t end) { int i; hts_pos_t fai_ref_len; - char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); + char *fai_ref; + + if (end < pos+stats->mrseq_buf-1) + end = pos+stats->mrseq_buf-1; + else if (stats->mrseq_buf < end - pos) { + size_t sz = end - pos; + uint8_t *new_rseq = realloc(stats->rseq_buf, sz); + if (!new_rseq) + error("Couldn't expand the reference sequence buffer\n"); + stats->rseq_buf = new_rseq; + stats->mrseq_buf = sz; + } + + fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len); if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid)); uint8_t *ptr = stats->rseq_buf; @@ -1200,6 +1214,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair stats->max_len_1st = read_len; if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len ) stats->max_len_2nd = read_len; + if ( ( bam_line->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FSUPPLEMENTARY|BAM_FQCFAIL|BAM_FDUP) ) == 0 ) + stats->mapping_qualities[bam_line->core.qual]++; int i; int gc_count = 0; @@ -1330,16 +1346,19 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair // 20kbp, so the effect is negligible. 
if ( stats->info->fai ) { - int inc_ref = 0, inc_gcd = 0; - // First pass or new chromosome - if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; } - // Read goes beyond the end of the rseq buffer - else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; } + hts_pos_t inc_ref = 0; + int inc_gcd = 0; + // First pass or new chromosome, or read goes beyond the rseq buffer + if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid + || stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen) { + inc_ref=bam_line->core.pos+readlen; + inc_gcd=1; + } // Read overlaps the next gcd bin else if ( stats->gcd_pos+stats->info->gcd_bin_size < bam_line->core.pos+readlen ) { inc_gcd = 1; - if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = 1; + if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = bam_line->core.pos+stats->info->gcd_bin_size; } if ( inc_gcd ) { @@ -1347,7 +1366,8 @@ void collect_stats(bam1_t *bam_line, stats_t *stats, khash_t(qn2pair) *read_pair if ( stats->igcd >= stats->ngcd ) realloc_gcd_buffer(stats, readlen); if ( inc_ref ) - read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos); + read_ref_seq(stats, bam_line->core.tid, + bam_line->core.pos, inc_ref); stats->gcd_pos = bam_line->core.pos; stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->info->gcd_bin_size); } @@ -1462,7 +1482,7 @@ float gcd_percentile(gc_depth_t *gcd, int N, int p) void output_stats(FILE *to, stats_t *stats, int sparse) { // Calculate average insert size and standard deviation (from the main bulk data only) - int isize, ibulk=0, icov; + int isize, ibulk=0, icov, imapq=0; uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0, cov_sum=0; double bulk=0, avg_isize=0, sd_isize=0; for (isize=0; isizeisize->nitems(stats->isize->data); isize++) @@ -1770,6 +1790,13 @@ void 
output_stats(FILE *to, stats_t *stats, int sparse) fprintf(to, "LRL\t%d\t%ld\n", ilen+1, (long)stats->read_lengths_2nd[ilen+1]); } + fprintf(to, "# Mapping qualities for reads !(UNMAP|SECOND|SUPPL|QCFAIL|DUP). Use `grep ^MAPQ | cut -f 2-` to extract this part. The columns are: mapq, count\n"); + for (imapq=0; imapq < 256; imapq++) + { + if ( stats->mapping_qualities[imapq]>0 ) + fprintf(to, "MAPQ\t%d\t%ld\n", imapq, (long)stats->mapping_qualities[imapq]); + } + fprintf(to, "# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); for (ilen=0; ilennindels; ilen++) @@ -2130,6 +2157,7 @@ void cleanup_stats(stats_t* stats) destroy_regions(stats); if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash); free(stats->split_name); + free(stats->mapping_qualities); free(stats); } @@ -2319,6 +2347,8 @@ static void init_stat_structs(stats_t* stats, stats_info_t* info, const char* gr if (!stats->del_cycles_1st) goto nomem; stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); if (!stats->del_cycles_2nd) goto nomem; + stats->mapping_qualities = calloc(256,sizeof(uint64_t)); + if(!stats->mapping_qualities) goto nomem; if (init_barcode_tags(stats) < 0) goto nomem; realloc_rseq_buffer(stats); diff --git a/samtools/version.sh b/samtools/version.sh index 5327353..1ac9413 100755 --- a/samtools/version.sh +++ b/samtools/version.sh @@ -24,7 +24,7 @@ # DEALINGS IN THE SOFTWARE. 
# Master version, for use in tarballs or non-git source copies -VERSION=1.16.1 +VERSION=1.17 # If we have a git clone, then check against the current tag if [ -e .git ] diff --git a/setup.py b/setup.py index 890b90a..291d0f9 100644 --- a/setup.py +++ b/setup.py @@ -170,6 +170,8 @@ def set_compiler_envvars(): tmp_vars = [] for var in ['CC', 'CFLAGS', 'LDFLAGS']: if var in os.environ: + if var == 'CFLAGS' and 'CCSHARED' in sysconfig.get_config_vars(): + os.environ[var] += ' ' + sysconfig.get_config_var('CCSHARED') print("# pysam: (env) {}={}".format(var, os.environ[var])) elif var in sysconfig.get_config_vars(): value = sysconfig.get_config_var(var) @@ -476,9 +478,7 @@ else: define_macros = [] -suffix = sysconfig.get_config_var('EXT_SUFFIX') -if not suffix: - suffix = sysconfig.get_config_var('SO') +suffix = sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO') internal_htslib_libraries = [ os.path.splitext("chtslib{}".format(suffix))[0]] diff --git a/tests/AlignedSegment_test.py b/tests/AlignedSegment_test.py index 400425f..1dc72d5 100644 --- a/tests/AlignedSegment_test.py +++ b/tests/AlignedSegment_test.py @@ -264,7 +264,7 @@ class TestAlignedSegment(ReadTest): a.template_length = 167 a.query_qualities = pysam.qualitystring_to_array("1234") * 200 - return a + self.assertTrue(a) def testUpdateTlen(self): """check if updating tlen works""" diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py index 8e75a52..083b6e5 100644 --- a/tests/AlignmentFilePileup_test.py +++ b/tests/AlignmentFilePileup_test.py @@ -1,6 +1,7 @@ """Benchmarking module for AlignmentFile functionality""" import os import pysam +import sys import unittest from TestUtils import make_data_files, BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list import PileupTestUtils @@ -206,6 +207,7 @@ class TestPileupObjects(unittest.TestCase): def tearDown(self): self.samfile.close() + @unittest.skipIf(sys.version_info >= (3, 11), "exercises invalid accesses, 
which crashes with Python 3.11") def testIteratorOutOfScope(self): '''test if exception is raised if pileup col is accessed after iterator is exhausted.''' diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 61531f4..4107c64 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -2008,8 +2008,8 @@ class TestFindIntrons(unittest.TestCase): def test_total(self): all_read_counts = self.samfile.count() splice_sites = self.samfile.find_introns(self.samfile.fetch()) - # there is a single unspliced read in there - self.assertEqual(sum(splice_sites.values()), all_read_counts - 1) + # there is a single unspliced read and a single unmapped read in there + self.assertEqual(sum(splice_sites.values()), all_read_counts - 2) def test_first(self): reads = list(self.samfile.fetch())[:10] diff --git a/tests/compile_test.py b/tests/compile_test.py index 300ab92..2ea3343 100644 --- a/tests/compile_test.py +++ b/tests/compile_test.py @@ -8,7 +8,7 @@ pysam and tabix works. 
# clean up previous compilation import os -import unittest +import pytest import pysam from TestUtils import make_data_files, BAM_DATADIR, TABIX_DATADIR @@ -24,31 +24,29 @@ try: except OSError: pass -import pyximport -pyximport.install(build_in_temp=False) -import _compile_test +NO_PYXIMPORT = False +try: + import pyximport + pyximport.install(build_in_temp=False) + import _compile_test +except: + NO_PYXIMPORT = True -class BAMTest(unittest.TestCase): +@pytest.mark.skipif(NO_PYXIMPORT, reason="no pyximport") +def test_bam(): input_filename = os.path.join(BAM_DATADIR, "ex1.bam") - - def testCount(self): - - nread = _compile_test.testCountBAM( - pysam.Samfile(self.input_filename)) - self.assertEqual(nread, 3270) + nread = _compile_test.testCountBAM( + pysam.Samfile(input_filename)) + assert nread == 3270 -class GTFTest(unittest.TestCase): +@pytest.mark.skipif(NO_PYXIMPORT, reason="no pyximport") +def test_gtf(): input_filename = os.path.join(TABIX_DATADIR, "example.gtf.gz") - def testCount(self): - nread = _compile_test.testCountGTF( - pysam.Tabixfile(self.input_filename)) - self.assertEqual(nread, 237) - - -if __name__ == "__main__": - unittest.main() + nread = _compile_test.testCountGTF( + pysam.Tabixfile(input_filename)) + assert nread == 237 diff --git a/tests/pysam_data/ex_spliced.sam b/tests/pysam_data/ex_spliced.sam index ae8086a..65147ef 100644 --- a/tests/pysam_data/ex_spliced.sam +++ b/tests/pysam_data/ex_spliced.sam @@ -284,6 +284,7 @@ HWI-C00113:131:HMHYWADXX:1:2103:9695:24819 272 1 17031 0 25M177N26M * 0 0 GCACAT HWI-C00113:131:HMHYWADXX:2:1204:13994:2816 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC ?@@DDDDDHHHHFHEFAABA@?FGBEFHIIIHH>DB@DHIHIDD>@@GHID NH:i:7 HI:i:6 AS:i:49 nM:i:0 HWI-C00113:131:HMHYWADXX:2:1212:15591:47491 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC @@C+ADDDDHFFDEGEGIIIDFHIFHIIIIIGEHIIBH>FGGGHGHFGGII NH:i:7 HI:i:6 AS:i:49 nM:i:0 
HWI-C00113:131:HMHYWADXX:2:2215:10125:81395 272 1 17031 0 25M859N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCAGGGCCCGCTCGTCCAGGGGGC CCCFFFFFGHHHHJJJJJJJJJHJJJJJJIJIIJJJHIJJJJJJJJJIJHE NH:i:6 HI:i:1 AS:i:49 nM:i:0 +HWI-C00113:131:HMHYWADXX:PLACED:UNMAPPED 4 1 17032 0 * * 0 0 ATGC HIJK HWI-C00113:131:HMHYWADXX:1:2102:9065:90529 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG C@CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJFHIFHIJIJJJJJJJJ NH:i:5 HI:i:2 AS:i:47 nM:i:0 HWI-C00113:131:HMHYWADXX:1:2204:7767:77376 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@@FDFDDBFHADEHEIGIGIJIGHIHG?EDGHGGCFH:B?BD@FGFHGIH NH:i:5 HI:i:2 AS:i:47 nM:i:0 HWI-C00113:131:HMHYWADXX:2:1212:6793:42000 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@?DADBD8CFADGFHIIIIE3A9?