Release notes
=============
+Release 0.21.0
+==============
+
+This release wraps htslib/samtools/bcftools version 1.17.
+
+Pysam is now compatible with Python 3.11. We have removed Python 2.x
+support. Pysam is tested with Python versions 3.6 to 3.11.
+
+* [#1175] VariantHeader.new_record: set start/stop before alleles
+* [#1173] Add multiple build improvements in htscodecs on multi-arch macOS
+* [#1148] Ignore CIGAR-less reads in find_introns.
+* [#1172] Add new `samtools cram-size` and `samtools reset` commands
+* [#1169] Fix CRAM index-related crash when using the musl C standard library.
+* [#1168] Add a minimal pyproject.toml for PEP517.
+* [#1158] Fix type hints and add FastqProxy type hints.
+* [#1147] Py3.11 compatibility, get shared object suffix from EXT_SUFFIX.
+* [#1143] Add mypy symbols for samtools and bcftools.
+* [#1155] Fix pysam.index() when using recent `samtools index` options.
+* [#1151] Test suite py3.11 compatibility, work around failing test case.
+* [#1149] macOS universal build compatibility.
+* [#1146] Fix build when CFLAGS/etc environment variables are set.
+
Release 0.20.0
==============
compilation options. Especially for OS X this will potentially save a
lot of trouble.
-The current version of pysam wraps 3rd-party code from htslib-1.16, samtools-1.16.1, and bcftools-1.16.
+The current version of pysam wraps 3rd-party code from htslib-1.17, samtools-1.17, and bcftools-1.17.
Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
The MIT/Expat License
-Copyright (C) 2012-2021 Genome Research Ltd.
+Copyright (C) 2012-2023 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
/* The MIT License
- Copyright (c) 2021-2022 Genome Research Ltd.
+ Copyright (c) 2021-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
assert(atom);
if ( altb!='-' ) kputc(altb, &atom->alt);
if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; }
+ continue;
}
- else
+ buf->natoms++;
+ hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
+ atom = &buf->atoms[buf->natoms-1];
+ atom->ref.l = 0;
+ atom->alt.l = 0;
+ kputc(refb, &atom->ref);
+ kputc(altb, &atom->alt);
+ atom->beg = atom->end = i;
+ atom->ial = ial;
+
+ if ( rlen!=alen && (i+1>=rlen || i+1>=alen) ) // the next base is an indel combined with SNV, e.g. C>GGG?
{
buf->natoms++;
hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
atom->ref.l = 0;
atom->alt.l = 0;
kputc(refb, &atom->ref);
- kputc(altb, &atom->alt);
+ kputc(refb, &atom->alt);
atom->beg = atom->end = i;
atom->ial = ial;
}
continue;
}
- if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion?
+ if ( i+1>=rlen || i+1>=alen ) // is the next base an indel?
{
buf->natoms++;
hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
_split_table_overlap(buf, j, atom);
}
}
+ // _split_table_print(buf);
+ // _split_table_print_atoms(buf);
assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
// Create the output records, transferring all annotations:
/* The MIT License
- Copyright (c) 2021-2022 Genome Research Ltd.
+ Copyright (c) 2021-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
assert(atom);
if ( altb!='-' ) kputc(altb, &atom->alt);
if ( refb!='-' ) { kputc(refb, &atom->ref); atom->end++; }
+ continue;
}
- else
+ buf->natoms++;
+ hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
+ atom = &buf->atoms[buf->natoms-1];
+ atom->ref.l = 0;
+ atom->alt.l = 0;
+ kputc(refb, &atom->ref);
+ kputc(altb, &atom->alt);
+ atom->beg = atom->end = i;
+ atom->ial = ial;
+
+ if ( rlen!=alen && (i+1>=rlen || i+1>=alen) ) // the next base is an indel combined with SNV, e.g. C>GGG?
{
buf->natoms++;
hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
atom->ref.l = 0;
atom->alt.l = 0;
kputc(refb, &atom->ref);
- kputc(altb, &atom->alt);
+ kputc(refb, &atom->alt);
atom->beg = atom->end = i;
atom->ial = ial;
}
continue;
}
- if ( i+1>=rlen || i+1>=alen ) // is the next base a deletion?
+ if ( i+1>=rlen || i+1>=alen ) // is the next base an indel?
{
buf->natoms++;
hts_expand0(atom_t,buf->natoms,buf->matoms,buf->atoms);
_split_table_overlap(buf, j, atom);
}
}
+ // _split_table_print(buf);
+ // _split_table_print_atoms(buf);
assert( !buf->rbuf.n ); // all records should be flushed first in the SPLIT mode
// Create the output records, transferring all annotations:
return bca;
}
+void bcf_iaux_destroy(bcf_callaux_t *bca);
void bcf_call_destroy(bcf_callaux_t *bca)
{
if (bca == 0) return;
+ bcf_iaux_destroy(bca);
errmod_destroy(bca->e);
if (bca->npos) {
free(bca->ref_pos); free(bca->alt_pos);
free(bca->bases); free(bca->inscns); free(bca);
}
-static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref)
+static int get_aux_nm(const bam_pileup1_t *p, int32_t qpos, int is_ref)
{
- uint8_t *nm_tag = bam_aux_get(rec, "NM");
- if ( !nm_tag ) return -1;
- int64_t nm = bam_aux2i(nm_tag);
+ int64_t nm;
+ const bam_pileup_cd *cd = &p->cd;
- // Count indels as single events, not as the number of inserted/deleted
- // bases (which is what NM does). Add soft clips as mismatches.
- int i;
- for (i=0; i < rec->core.n_cigar; i++)
+ if ( PLP_NM(cd) == -1 ) return -1;
+ if ( PLP_NM(cd) == PLP_NM_UNSET )
{
- int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK;
- if ( val==BAM_CSOFT_CLIP )
+ // todo: make this localized to be useful for long reads as well
+ bam1_t *rec = p->b;
+ uint8_t *nm_tag = bam_aux_get(rec, "NM");
+ if ( !nm_tag )
{
- nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ PLP_NM(cd) = -1;
+ return -1;
}
- else if ( val==BAM_CINS || val==BAM_CDEL )
+ nm = bam_aux2i(nm_tag);
+
+ // Count indels as single events, not as the number of inserted/deleted
+ // bases (which is what NM does). Add soft clips as mismatches.
+ int i;
+ for (i=0; i < rec->core.n_cigar; i++)
{
- val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
- if ( val > 1 ) nm -= val - 1;
+ int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK;
+ if ( val==BAM_CSOFT_CLIP )
+ {
+ nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ }
+ else if ( val==BAM_CINS || val==BAM_CDEL )
+ {
+ val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ if ( val > 1 ) nm -= val - 1;
+ }
}
+ PLP_NM(cd) = nm;
}
+ else
+ nm = PLP_NM(cd);
// Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV
// http://www.genome.org/cgi/doi/10.1101/gr.239756.118
memset(bca->alt_scl, 0, 100*sizeof(int));
memset(bca->iref_scl, 0, 100*sizeof(int));
memset(bca->ialt_scl, 0, 100*sizeof(int));
+ int i;
+ for (i=0; i<2; i++) bca->nnm[i] = 0;
+ for (i=0; i<2; i++) bca->nm[i] = 0;
}
/*
int ADF_ref_missed[4] = {0};
for (i = n = 0; i < _n; ++i) {
const bam_pileup1_t *p = pl + i;
- int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
- if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
+ int b; // the base or indel type
+ int q; // the base or indel quality used to calculate PL
+ int seqQ; // used to cap the indel quality given the sequence context
+ int mapQ; // to cap the quality for low MQ reads
+ int baseQ; // used only for supporting INFO annotations
+ int is_diff; // is this base or indel type different from the reference
+ int min_dist; // distance from the end, used for tail distance bias
+ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(&p->cd) ) r->SCR++;
if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
- if (p->is_del && !is_indel) continue;
+
+ // The meaning of the indel related variables:
+ // is_indel .. is this position currently tested for an indel
+ // p->is_del .. is the current base a deletion in this read (unrelated to the tested indel)
+ // p->indel .. is there an indel starting after this position (i.e. does this read have the tested indel)
+ if (p->is_del && !is_indel) continue; // not testing an indel and the read has a spanning deletion
+
+ int inm = -1;
+
++ori_depth;
- if (is_indel)
+ if (is_indel) // testing an indel position
{
- b = p->aux>>16&0x3f;
+ b = p->aux>>16&0x3f; // indel type
seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+
+ if ( !bca->indels_v20 )
+ {
+ /*
+ This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain
+ correct result on the provided test case even when this code is commented out, so this
+ may not be needed anymore. Leaving it in only for backward compatibility for now.
+ See mpileup-tests homdel-issue-1446 and CHM1_CHM13_2.45x-1-1701408 which work only when
+ this code is disabled.
+ */
+ if (p->indel == 0 && (q < _n/2 || _n > 20)) {
+ // high quality indel calls without p->indel set aren't
+ // particularly indicative of being a good REF match either,
+ // at least not in low coverage. So require solid coverage
+ // before we start utilising such quals.
+ b = 0;
+ q = (int)bam_get_qual(p->b)[p->qpos];
+ seqQ = (3*seqQ + 2*q)/8;
+ }
+ if (_n > 20 && seqQ > 40) seqQ = 40;
+ }
+
+ is_diff = b ? 1 : 0;
+ if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) )
+ {
+ inm = get_aux_nm(p,p->qpos,is_diff?0:1);
+ if ( inm>=0 )
+ {
+ bca->nnm[is_diff]++;
+ bca->nm[is_diff] += inm;
+ }
+ }
+
if (q < bca->min_baseQ)
{
- if (!p->indel && b < 4)
+ if (!p->indel && b < 4) // not an indel read
{
if (bam_is_rev(p->b))
ADR_ref_missed[b]++;
}
continue;
}
- if (p->indel == 0 && (q < _n/2 || _n > 20)) {
- // high quality indel calls without p->indel set aren't
- // particularly indicative of being a good REF match either,
- // at least not in low coverage. So require solid coverage
- // before we start utilising such quals.
- b = 0;
- q = (int)bam_get_qual(p->b)[p->qpos];
- seqQ = (3*seqQ + 2*q)/8;
- }
- if (_n > 20 && seqQ > 40) seqQ = 40;
baseQ = p->aux>>8&0xff;
-
- is_diff = (b != 0);
}
else
{
baseQ = q;
seqQ = 99;
is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+ if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) )
+ {
+ inm = get_aux_nm(p,p->qpos,is_diff?0:1);
+ if ( inm>=0 )
+ {
+ bca->nnm[is_diff]++;
+ bca->nm[is_diff] += inm;
+ }
+ }
}
mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
if ( !mapQ ) r->mq0++;
if (q > 63) q = 63;
if (q < 4) q = 4; // MQ=0 reads count as BQ=4
bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
+ //if (is_indel) fprintf(stderr,"xx:base,q,strand\t%d\t%d\t%d\n",b,q,bam_is_rev(p->b)?0:1);
+
// collect annotations
if (b < 4)
{
if ( baseQ > 59 ) baseQ = 59;
if ( mapQ > 59 ) mapQ = 59;
int len, epos = 0, sc_len = 0, sc_dist = 0;
- if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
+ if ( bca->fmt_flag & (B2B_INFO_RPBZ|B2B_INFO_VDB|B2B_INFO_SCBZ) )
{
int pos = get_position(p, &len, &sc_len, &sc_dist);
- epos = (double)pos/(len+1) * bca->npos;
-
+ epos = (double)pos/(len+1) * (bca->npos - 1);
if (sc_len) {
- sc_len = 15.0*sc_len / sc_dist;
+ sc_len = 15.0*sc_len / (sc_dist+1);
if (sc_len > 99) sc_len = 99;
}
+ assert( epos>=0 && epos<bca->npos );
+ assert( sc_len>=0 && sc_len<bca->npos );
}
int imq = mapQ * nqual_over_60;
int ibq = baseQ * nqual_over_60;
- int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1);
if ( bam_is_rev(p->b) )
bca->rev_mqs[imq]++;
}
// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
+ // fprintf(stderr,"sum_min=%f\n",sum_min);
call->shift = (int)(sum_min + .499);
}
// combine annotations
// No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly
if ( !has_alt ) return 0;
- calc_SegBias(calls, call);
+ if ( bca->fmt_flag & B2B_INFO_FS )
+ {
+ double left,right,two;
+ call->strand_bias = kt_fisher_exact(call->anno[0], call->anno[1], call->anno[2], call->anno[3], &left, &right, &two);
+ }
+ if ( bca->fmt_flag & B2B_INFO_SGB ) calc_SegBias(calls, call);
// calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
// calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
// calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
- if (bca->fmt_flag & B2B_INFO_ZSCORE) {
- // U z-normalised as +/- number of standard deviations from mean.
- if (call->ori_ref < 0) { // indel
- if (bca->fmt_flag & B2B_INFO_RPB)
- call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
- bca->npos, 0, 1);
- call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
- bca->nqual,1,1);
- if ( bca->fmt_flag & B2B_INFO_SCB )
- call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
- 100, 0,1);
- } else {
- if (bca->fmt_flag & B2B_INFO_RPB)
- call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
- bca->npos, 0, 1);
- call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
- bca->nqual,1,1);
- call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
- bca->nqual,0,1);
- call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
- bca->nqual,0,1);
- if ( bca->fmt_flag & B2B_INFO_SCB )
- call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
- 100, 0,1);
- }
+ // U z-normalised as +/- number of standard deviations from mean.
+ if (call->ori_ref < 0) { // indel
+ if ( bca->fmt_flag & B2B_INFO_RPBZ )
+ call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, bca->npos, 0, 1);
+ if ( bca->fmt_flag & B2B_INFO_MQBZ )
+ call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, bca->nqual,1,1);
+ if ( bca->fmt_flag & B2B_INFO_SCBZ )
+ call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, 100, 0,1);
+ } else {
+ if ( bca->fmt_flag & B2B_INFO_RPBZ )
+ call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, bca->npos, 0, 1);
+ if ( bca->fmt_flag & B2B_INFO_MQBZ )
+ call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, bca->nqual,1,1);
+ if ( bca->fmt_flag & B2B_INFO_BQBZ )
+ call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, bca->nqual,0,1);
+ if ( bca->fmt_flag & B2B_INFO_MQSBZ )
+ call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, bca->nqual,0,1);
+ if ( bca->fmt_flag & B2B_INFO_SCBZ )
+ call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, 100, 0,1);
+ }
+ if ( bca->fmt_flag & B2B_INFO_NMBZ )
call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1);
- if ( bca->fmt_flag & B2B_FMT_NMBZ )
+ if ( bca->fmt_flag & B2B_FMT_NMBZ )
+ {
+ for (i=0; i<n; i++)
{
- for (i=0; i<n; i++)
- {
- float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
- call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
- }
+ float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
+ call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
}
- } else {
- // Old method; U as probability between 0 and 1
- if ( bca->fmt_flag & B2B_INFO_RPB )
- call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
- bca->npos, 0, 0);
- call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
- bca->nqual, 1, 0);
- call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
- bca->nqual, 0, 0);
- call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
- bca->nqual, 0, 0);
}
-
-#if CDF_MWU_TESTS
- // CDF version of MWU tests is not calculated by default
- if ( bca->fmt_flag & B2B_INFO_RPB )
- call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
- call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
- call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
- call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
-#endif
-
if ( bca->fmt_flag & B2B_INFO_VDB )
call->vdb = calc_vdb(bca->alt_pos, bca->npos);
bc->tmp.l = 0;
// INFO
- if (bc->ori_ref < 0)
+ if ( bc->ori_ref < 0 )
{
bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
- bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
- bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
+ if ( fmt_flag&B2B_INFO_IDV )
+ bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
+ if ( fmt_flag&B2B_INFO_IMF )
+ bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
}
bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
if ( fmt_flag&B2B_INFO_ADF )
if ( has_alt )
{
- if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
- if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
-
- if (bca->fmt_flag & B2B_INFO_ZSCORE) {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
- if ( bc->mwu_nm[0] != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
- if ( bc->mwu_sc != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
- } else {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+ if ( fmt_flag&B2B_INFO_MIN_PL_SUM )
+ bcf_update_info_int32(hdr, rec, "MIN_PL_SUM", &bc->shift, 1);
+ if ( fmt_flag&B2B_INFO_VDB && bc->vdb != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
+ if ( fmt_flag&B2B_INFO_SGB && bc->seg_bias != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
+ if ( fmt_flag&B2B_INFO_NM && (bca->nnm[0] || bca->nnm[1]) )
+ {
+ for (i=0; i<2; i++) bc->nm[i] = bca->nnm[i] ? bca->nm[i]/bca->nnm[i] : 0;
+ bcf_update_info_float(hdr, rec, "NM", bc->nm, 2);
}
- if ( bc->strand_bias != HUGE_VAL )
+ if ( fmt_flag&B2B_INFO_RPBZ && bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+ if ( fmt_flag&B2B_INFO_MQBZ && bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+ if ( fmt_flag&B2B_INFO_MQSBZ && bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+ if ( fmt_flag&B2B_INFO_BQBZ && bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+ if ( fmt_flag&B2B_INFO_NMBZ && bc->mwu_nm[0] != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
+ if ( fmt_flag&B2B_INFO_SCBZ && bc->mwu_sc != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+ if ( fmt_flag&B2B_INFO_FS && bc->strand_bias != HUGE_VAL )
bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
-
-#if CDF_MWU_TESTS
- if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
- if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
- if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
- if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
-#endif
}
tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
- bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
+ if ( fmt_flag&B2B_INFO_MQ0F )
+ bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
// FORMAT
rec->n_sample = bc->n;
return bca;
}
+void bcf_iaux_destroy(bcf_callaux_t *bca);
void bcf_call_destroy(bcf_callaux_t *bca)
{
if (bca == 0) return;
+ bcf_iaux_destroy(bca);
errmod_destroy(bca->e);
if (bca->npos) {
free(bca->ref_pos); free(bca->alt_pos);
free(bca->bases); free(bca->inscns); free(bca);
}
-static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref)
+static int get_aux_nm(const bam_pileup1_t *p, int32_t qpos, int is_ref)
{
- uint8_t *nm_tag = bam_aux_get(rec, "NM");
- if ( !nm_tag ) return -1;
- int64_t nm = bam_aux2i(nm_tag);
+ int64_t nm;
+ const bam_pileup_cd *cd = &p->cd;
- // Count indels as single events, not as the number of inserted/deleted
- // bases (which is what NM does). Add soft clips as mismatches.
- int i;
- for (i=0; i < rec->core.n_cigar; i++)
+ if ( PLP_NM(cd) == -1 ) return -1;
+ if ( PLP_NM(cd) == PLP_NM_UNSET )
{
- int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK;
- if ( val==BAM_CSOFT_CLIP )
+ // todo: make this localized to be useful for long reads as well
+ bam1_t *rec = p->b;
+ uint8_t *nm_tag = bam_aux_get(rec, "NM");
+ if ( !nm_tag )
{
- nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ PLP_NM(cd) = -1;
+ return -1;
}
- else if ( val==BAM_CINS || val==BAM_CDEL )
+ nm = bam_aux2i(nm_tag);
+
+ // Count indels as single events, not as the number of inserted/deleted
+ // bases (which is what NM does). Add soft clips as mismatches.
+ int i;
+ for (i=0; i < rec->core.n_cigar; i++)
{
- val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
- if ( val > 1 ) nm -= val - 1;
+ int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK;
+ if ( val==BAM_CSOFT_CLIP )
+ {
+ nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ }
+ else if ( val==BAM_CINS || val==BAM_CDEL )
+ {
+ val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
+ if ( val > 1 ) nm -= val - 1;
+ }
}
+ PLP_NM(cd) = nm;
}
+ else
+ nm = PLP_NM(cd);
// Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV
// http://www.genome.org/cgi/doi/10.1101/gr.239756.118
memset(bca->alt_scl, 0, 100*sizeof(int));
memset(bca->iref_scl, 0, 100*sizeof(int));
memset(bca->ialt_scl, 0, 100*sizeof(int));
+ int i;
+ for (i=0; i<2; i++) bca->nnm[i] = 0;
+ for (i=0; i<2; i++) bca->nm[i] = 0;
}
/*
int ADF_ref_missed[4] = {0};
for (i = n = 0; i < _n; ++i) {
const bam_pileup1_t *p = pl + i;
- int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
- if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(p->cd.i) ) r->SCR++;
+ int b; // the base or indel type
+ int q; // the base or indel quality used to calculate PL
+ int seqQ; // used to cap the indel quality given the sequence context
+ int mapQ; // to cap the quality for low MQ reads
+ int baseQ; // used only for supporting INFO annotations
+ int is_diff; // is this base or indel type different from the reference
+ int min_dist; // distance from the end, used for tail distance bias
+ if ( bca->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) && PLP_HAS_SOFT_CLIP(&p->cd) ) r->SCR++;
if (p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
- if (p->is_del && !is_indel) continue;
+
+ // The meaning of the indel related variables:
+ // is_indel .. is this position currently tested for an indel
+ // p->is_del .. is the current base a deletion in this read (unrelated to the tested indel)
+ // p->indel .. is there an indel starting after this position (i.e. does this read have the tested indel)
+ if (p->is_del && !is_indel) continue; // not testing an indel and the read has a spanning deletion
+
+ int inm = -1;
+
++ori_depth;
- if (is_indel)
+ if (is_indel) // testing an indel position
{
- b = p->aux>>16&0x3f;
+ b = p->aux>>16&0x3f; // indel type
seqQ = q = (p->aux & 0xff); // mp2 + builtin indel-bias
+
+ if ( !bca->indels_v20 )
+ {
+ /*
+ This heuristics was introduced by e4e161068 and claims to fix #1446. However, we obtain
+ correct result on the provided test case even when this code is commented out, so this
+ may not be needed anymore. Leaving it in only for backward compatibility for now.
+ See mpileup-tests homdel-issue-1446 and CHM1_CHM13_2.45x-1-1701408 which work only when
+ this code is disabled.
+ */
+ if (p->indel == 0 && (q < _n/2 || _n > 20)) {
+ // high quality indel calls without p->indel set aren't
+ // particularly indicative of being a good REF match either,
+ // at least not in low coverage. So require solid coverage
+ // before we start utilising such quals.
+ b = 0;
+ q = (int)bam_get_qual(p->b)[p->qpos];
+ seqQ = (3*seqQ + 2*q)/8;
+ }
+ if (_n > 20 && seqQ > 40) seqQ = 40;
+ }
+
+ is_diff = b ? 1 : 0;
+ if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) )
+ {
+ inm = get_aux_nm(p,p->qpos,is_diff?0:1);
+ if ( inm>=0 )
+ {
+ bca->nnm[is_diff]++;
+ bca->nm[is_diff] += inm;
+ }
+ }
+
if (q < bca->min_baseQ)
{
- if (!p->indel && b < 4)
+ if (!p->indel && b < 4) // not an indel read
{
if (bam_is_rev(p->b))
ADR_ref_missed[b]++;
}
continue;
}
- if (p->indel == 0 && (q < _n/2 || _n > 20)) {
- // high quality indel calls without p->indel set aren't
- // particularly indicative of being a good REF match either,
- // at least not in low coverage. So require solid coverage
- // before we start utilising such quals.
- b = 0;
- q = (int)bam_get_qual(p->b)[p->qpos];
- seqQ = (3*seqQ + 2*q)/8;
- }
- if (_n > 20 && seqQ > 40) seqQ = 40;
baseQ = p->aux>>8&0xff;
-
- is_diff = (b != 0);
}
else
{
baseQ = q;
seqQ = 99;
is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+ if ( bca->fmt_flag&(B2B_FMT_NMBZ|B2B_INFO_NMBZ|B2B_INFO_NM) )
+ {
+ inm = get_aux_nm(p,p->qpos,is_diff?0:1);
+ if ( inm>=0 )
+ {
+ bca->nnm[is_diff]++;
+ bca->nm[is_diff] += inm;
+ }
+ }
}
mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
if ( !mapQ ) r->mq0++;
if (q > 63) q = 63;
if (q < 4) q = 4; // MQ=0 reads count as BQ=4
bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
+ //if (is_indel) fprintf(bcftools_stderr,"xx:base,q,strand\t%d\t%d\t%d\n",b,q,bam_is_rev(p->b)?0:1);
+
// collect annotations
if (b < 4)
{
if ( baseQ > 59 ) baseQ = 59;
if ( mapQ > 59 ) mapQ = 59;
int len, epos = 0, sc_len = 0, sc_dist = 0;
- if ( bca->fmt_flag & (B2B_INFO_RPB|B2B_INFO_VDB|B2B_INFO_SCB) )
+ if ( bca->fmt_flag & (B2B_INFO_RPBZ|B2B_INFO_VDB|B2B_INFO_SCBZ) )
{
int pos = get_position(p, &len, &sc_len, &sc_dist);
- epos = (double)pos/(len+1) * bca->npos;
-
+ epos = (double)pos/(len+1) * (bca->npos - 1);
if (sc_len) {
- sc_len = 15.0*sc_len / sc_dist;
+ sc_len = 15.0*sc_len / (sc_dist+1);
if (sc_len > 99) sc_len = 99;
}
+ assert( epos>=0 && epos<bca->npos );
+ assert( sc_len>=0 && sc_len<bca->npos );
}
int imq = mapQ * nqual_over_60;
int ibq = baseQ * nqual_over_60;
- int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1);
if ( bam_is_rev(p->b) )
bca->rev_mqs[imq]++;
}
// if (ref_base < 0) fprintf(bcftools_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
+ // fprintf(bcftools_stderr,"sum_min=%f\n",sum_min);
call->shift = (int)(sum_min + .499);
}
// combine annotations
// No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly
if ( !has_alt ) return 0;
- calc_SegBias(calls, call);
+ if ( bca->fmt_flag & B2B_INFO_FS )
+ {
+ double left,right,two;
+ call->strand_bias = kt_fisher_exact(call->anno[0], call->anno[1], call->anno[2], call->anno[3], &left, &right, &two);
+ }
+ if ( bca->fmt_flag & B2B_INFO_SGB ) calc_SegBias(calls, call);
// calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
// calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
// calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
- if (bca->fmt_flag & B2B_INFO_ZSCORE) {
- // U z-normalised as +/- number of standard deviations from mean.
- if (call->ori_ref < 0) { // indel
- if (bca->fmt_flag & B2B_INFO_RPB)
- call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
- bca->npos, 0, 1);
- call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq,
- bca->nqual,1,1);
- if ( bca->fmt_flag & B2B_INFO_SCB )
- call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl,
- 100, 0,1);
- } else {
- if (bca->fmt_flag & B2B_INFO_RPB)
- call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
- bca->npos, 0, 1);
- call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
- bca->nqual,1,1);
- call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
- bca->nqual,0,1);
- call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
- bca->nqual,0,1);
- if ( bca->fmt_flag & B2B_INFO_SCB )
- call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
- 100, 0,1);
- }
+ // U z-normalised as +/- number of standard deviations from mean.
+ if (call->ori_ref < 0) { // indel
+ if ( bca->fmt_flag & B2B_INFO_RPBZ )
+ call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos, bca->npos, 0, 1);
+ if ( bca->fmt_flag & B2B_INFO_MQBZ )
+ call->mwu_mq = calc_mwu_biasZ(bca->iref_mq, bca->ialt_mq, bca->nqual,1,1);
+ if ( bca->fmt_flag & B2B_INFO_SCBZ )
+ call->mwu_sc = calc_mwu_biasZ(bca->iref_scl, bca->ialt_scl, 100, 0,1);
+ } else {
+ if ( bca->fmt_flag & B2B_INFO_RPBZ )
+ call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos, bca->npos, 0, 1);
+ if ( bca->fmt_flag & B2B_INFO_MQBZ )
+ call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq, bca->nqual,1,1);
+ if ( bca->fmt_flag & B2B_INFO_BQBZ )
+ call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq, bca->nqual,0,1);
+ if ( bca->fmt_flag & B2B_INFO_MQSBZ )
+ call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs, bca->nqual,0,1);
+ if ( bca->fmt_flag & B2B_INFO_SCBZ )
+ call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl, 100, 0,1);
+ }
+ if ( bca->fmt_flag & B2B_INFO_NMBZ )
call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1);
- if ( bca->fmt_flag & B2B_FMT_NMBZ )
+ if ( bca->fmt_flag & B2B_FMT_NMBZ )
+ {
+ for (i=0; i<n; i++)
{
- for (i=0; i<n; i++)
- {
- float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
- call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
- }
+ float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
+ call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
}
- } else {
- // Old method; U as probability between 0 and 1
- if ( bca->fmt_flag & B2B_INFO_RPB )
- call->mwu_pos = calc_mwu_biasZ(bca->ref_pos, bca->alt_pos,
- bca->npos, 0, 0);
- call->mwu_mq = calc_mwu_biasZ(bca->ref_mq, bca->alt_mq,
- bca->nqual, 1, 0);
- call->mwu_bq = calc_mwu_biasZ(bca->ref_bq, bca->alt_bq,
- bca->nqual, 0, 0);
- call->mwu_mqs = calc_mwu_biasZ(bca->fwd_mqs, bca->rev_mqs,
- bca->nqual, 0, 0);
}
-
-#if CDF_MWU_TESTS
- // CDF version of MWU tests is not calculated by default
- if ( bca->fmt_flag & B2B_INFO_RPB )
- call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
- call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
- call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
- call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
-#endif
-
if ( bca->fmt_flag & B2B_INFO_VDB )
call->vdb = calc_vdb(bca->alt_pos, bca->npos);
bc->tmp.l = 0;
// INFO
- if (bc->ori_ref < 0)
+ if ( bc->ori_ref < 0 )
{
bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
- bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
- bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
+ if ( fmt_flag&B2B_INFO_IDV )
+ bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
+ if ( fmt_flag&B2B_INFO_IMF )
+ bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
}
bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
if ( fmt_flag&B2B_INFO_ADF )
if ( has_alt )
{
- if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
- if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
-
- if (bca->fmt_flag & B2B_INFO_ZSCORE) {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
- if ( bc->mwu_nm[0] != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
- if ( bc->mwu_sc != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
- } else {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+ if ( fmt_flag&B2B_INFO_MIN_PL_SUM )
+ bcf_update_info_int32(hdr, rec, "MIN_PL_SUM", &bc->shift, 1);
+ if ( fmt_flag&B2B_INFO_VDB && bc->vdb != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
+ if ( fmt_flag&B2B_INFO_SGB && bc->seg_bias != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
+ if ( fmt_flag&B2B_INFO_NM && (bca->nnm[0] || bca->nnm[1]) )
+ {
+ for (i=0; i<2; i++) bc->nm[i] = bca->nnm[i] ? bca->nm[i]/bca->nnm[i] : 0;
+ bcf_update_info_float(hdr, rec, "NM", bc->nm, 2);
}
- if ( bc->strand_bias != HUGE_VAL )
+ if ( fmt_flag&B2B_INFO_RPBZ && bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+ if ( fmt_flag&B2B_INFO_MQBZ && bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+ if ( fmt_flag&B2B_INFO_MQSBZ && bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+ if ( fmt_flag&B2B_INFO_BQBZ && bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+ if ( fmt_flag&B2B_INFO_NMBZ && bc->mwu_nm[0] != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
+ if ( fmt_flag&B2B_INFO_SCBZ && bc->mwu_sc != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+ if ( fmt_flag&B2B_INFO_FS && bc->strand_bias != HUGE_VAL )
bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
-
-#if CDF_MWU_TESTS
- if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
- if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
- if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
- if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
-#endif
}
tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
- bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
+ if ( fmt_flag&B2B_INFO_MQ0F )
+ bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
// FORMAT
rec->n_sample = bc->n;
#define B2B_INFO_SCR (1<<12)
#define B2B_FMT_SCR (1<<13)
#define B2B_INFO_VDB (1<<14)
-#define B2B_INFO_RPB (1<<15)
-#define B2B_FMT_QS (1<<16)
-#define B2B_INFO_SCB (1<<17)
-#define B2B_FMT_NMBZ (1<<18) // per-sample NMBZ
-#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised
+// Annotation selector bits. Each B2B_INFO_* bit, when set in fmt_flag, enables writing of the
+// INFO tag of the same name; the B2B_FMT_* bits presumably do the same for per-sample FORMAT
+// tags (TODO confirm against the FORMAT-writing code). All bits must stay distinct and fit
+// in the int-typed fmt_flag.
+#define B2B_FMT_QS (1<<15)
+#define B2B_FMT_NMBZ (1<<16) // per-sample NMBZ
+#define B2B_INFO_NMBZ (1<<17)
+#define B2B_INFO_BQBZ (1<<18)
+#define B2B_INFO_MQBZ (1<<19)
+#define B2B_INFO_MQSBZ (1<<20)
+#define B2B_INFO_RPBZ (1<<21)
+#define B2B_INFO_SCBZ (1<<22)
+#define B2B_INFO_SGB (1<<23)
+#define B2B_INFO_MIN_PL_SUM (1<<24)
+#define B2B_INFO_NM (1<<25)
+#define B2B_INFO_MQ0F (1<<26)
+#define B2B_INFO_IDV (1<<27)
+#define B2B_INFO_IMF (1<<28)
+#define B2B_INFO_FS (1<<29)
#define B2B_MAX_ALLELES 5
#define B2B_N_NM 32 // number of NMBZ bins, i.e. max number of mismatches
#define B2B_INC_AD 1
#define B2B_INC_AD0 2
-#define PLP_HAS_SOFT_CLIP(i) ((i)&1)
-#define PLP_HAS_INDEL(i) ((i)&2)
-#define PLP_SAMPLE_ID(i) ((i)>>2)
-#define PLP_SET_SOFT_CLIP(i) ((i)|=1)
-#define PLP_SET_INDEL(i) ((i)|=2)
-#define PLP_SET_SAMPLE_ID(i,n) ((i)|=(n)<<2)
+// Pileup "client data" for each read to cache per-read information.
+// All PLP_* accessors take a pointer to the pileup client-data object (e.g. &plp->cd) whose
+// member `p` points to a plp_cd_t.
+// Bit layout of plp_cd_t.i: bit 0 = soft-clip flag, bit 1 = indel flag, bit 2 = realn flag,
+// bits 3 and above = sample id.
+#define PLP_CD(x) ((plp_cd_t*)((x)->p))
+#define PLP_HAS_SOFT_CLIP(cd) (PLP_CD(cd)->i & 1)
+#define PLP_HAS_INDEL(cd) (PLP_CD(cd)->i & 2)
+#define PLP_IS_REALN(cd) (PLP_CD(cd)->i & 4)
+#define PLP_SAMPLE_ID(cd) (PLP_CD(cd)->i >> 3)
+#define PLP_QLEN(cd) (PLP_CD(cd)->qlen)
+#define PLP_NM(cd) (PLP_CD(cd)->nm)
+// Sentinel for plp_cd_t.nm meaning "not computed yet"
+#define PLP_NM_UNSET -2
+
+// Setters only OR bits in, they never clear. PLP_SET_SAMPLE_ID therefore yields the intended
+// id only when the sample-id bits are still zero.
+#define PLP_SET_SOFT_CLIP(cd) (PLP_CD(cd)->i |= 1)
+#define PLP_SET_INDEL(cd) (PLP_CD(cd)->i |= 2)
+#define PLP_SET_REALN(cd) (PLP_CD(cd)->i |= 4)
+#define PLP_SET_SAMPLE_ID(cd,n) (PLP_CD(cd)->i |= (n)<<3)
+
+// Per-read cache reached through the PLP_* macros
+typedef struct
+{
+ int64_t i; // used to store sample id and flags for presence of soft-clip, indel and realn (see bit layout above)
+ uint32_t qlen; // cached output of bam_cigar2qlen(), 0 if unset
+ int nm; // -2 PLP_NM_UNSET; -1 not available; >=0 NM value computed by get_aux_nm()
+}
+plp_cd_t;
+
typedef struct __bcf_callaux_t {
int fmt_flag, ambig_reads;
// for internal uses
int max_bases;
int indel_types[4]; // indel lengths
- int indel_win_size;
+ int indel_win_size, indels_v20;
int maxins, indelreg;
int read_len;
char *inscns;
void *rghash;
float indel_bias; // adjusts indel score threshold; lower => call more.
int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm}
+ unsigned int nnm[2]; // number of nm observations
+ float nm[2]; // cumulative count of mismatches in ref and alt reads
+ void *iaux; // auxiliary structure for --indels-2.0 calling
+ char *chr; // current chromosome
} bcf_callaux_t;
// per-sample values
bcf_hdr_t *bcf_hdr;
int a[5]; // alleles: ref, alt, alt2, alt3
float qsum[B2B_MAX_ALLELES]; // INFO/QS tag
- int n, n_alleles, shift, ori_ref, unseen;
+ int n, n_alleles, ori_ref, unseen;
+ int32_t shift; // shift is the sum of min_PL before normalization to 0 across all samples
int n_supp; // number of supporting non-reference reads
double anno[16];
unsigned int depth, ori_depth, mq0;
int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS, *ref_nm, *alt_nm;
uint8_t *fmt_arr;
float vdb; // variant distance bias
- float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc, *mwu_nm;
-#if CDF_MWU_TESTS
- float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
-#endif
+ float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc, *mwu_nm, nm[2];
float seg_bias;
float strand_bias; // phred-scaled fisher-exact test
kstring_t tmp;
} bcf_call_t;
+
#ifdef __cplusplus
extern "C" {
#endif
int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
const bcf_callaux_t *bca, const char *ref);
int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
+ // Indel caller taking the same arguments as bcf_call_gap_prep(). Sets bam_pileup1_t.aux of
+ // each read; returns 0 when at least one read was assigned to a non-reference indel type
+ // and -1 otherwise (no indel, site skipped, or error).
+ int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call);
+ // Helpers shared with the iaux indel caller: bcf_cgp_l_run() is used there as the length of
+ // the homopolymer run around pos; est_indelreg() as the extent of the indel region for an
+ // indel of length l (ins4 is the inserted sequence, or 0 for a deletion).
+ int bcf_cgp_l_run(const char *ref, int pos);
+ int est_indelreg(int pos, const char *ref, int l, char *ins4);
+
#ifdef __cplusplus
}
#endif
--- /dev/null
+/* bam2bcf_iaux.c -- modified indel caller
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: pd3@sanger, jkb
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE
+*/
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
+#include "bcftools.h"
+#include "bam2bcf.h"
+#include "read_consensus.h"
+#include "cigar_state.h"
+
+#include <htslib/ksort.h>
+KSORT_INIT_STATIC_GENERIC(uint32_t)
+
+#ifndef DEBUG_ALN
+#define DEBUG_ALN 0
+#endif
+
+#define MAX_TYPES 64
+
+// Working state of the indel caller. One instance hangs off bcf_callaux_t.iaux and is released
+// by bcf_iaux_destroy(); the buffers it owns are grown on demand and reused between calls.
+typedef struct
+{
+ int pos; // current position
+ char *chr; // current chromosome
+ int nsmpl; // number of samples
+ int *nplp; // per-sample number of reads
+ bam_pileup1_t **plp; // per-sample reads
+ bcf_callaux_t *bca; // auxiliary bam2bcf structure
+ const char *ref; // reference genome (ASCII)
+ uint32_t *uitmp; // temporary unsigned int array
+ char *inscns; // insertions consensus "ACGTN"[itype*max_ins_len+i]
+ int muitmp, minscns; // size of uitmp, inscns
+ int iref_type, ntypes, types[MAX_TYPES]; // indel types: index of the reference (0-length) type, number of types, indel lengths (<0 deletion, >0 insertion)
+ int max_ins_len; // largest insertion
+ int left, right; // consensus sequence boundaries, 0-based fa ref coordinates
+ read_cns_t *rcns; // read consensus
+ cns_seq_t *cns_seq; // array of consensus sequences
+ int *cns_pos; // array of relative pos indexes within cns_seq sequences
+ uint8_t *ref_seq, *qry_seq; // reference and query sequence to align
+ int nref_seq, nqry_seq; // the allocated size of ref_seq and qry_seq
+ uint8_t *qual; // per-base qualities handed to the aligner together with qry_seq
+ int nqual; // the allocated size of qual
+ int *read_scores, // read scores for each indel type [ntypes*iread+itype]
+ mread_scores, // the allocated size (number of ints) of read_scores
+ ref_qual[MAX_TYPES], // refseq quality at pos for each indel type in the context of homopolymer runs
+ sum_qual[MAX_TYPES]; // qual contributions to each indel type from all reads
+}
+indel_aux_t;
+
+#if DEBUG_ALN
+// Debugging aid: writes one line to stderr listing every indel type found at the current
+// position. Deletions and the reference type are shown by their length (the reference type
+// is tagged "(ref)"), insertions by their consensus sequence.
+static void debug_print_types(indel_aux_t *iaux)
+{
+    int t, k;
+    fprintf(stderr,"types at %s:%d ntypes=%d... ",iaux->chr,iaux->pos+1,iaux->ntypes);
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        int len = iaux->types[t];
+        fprintf(stderr," type%d=",t);
+        if ( len > 0 )
+        {
+            // insertion: decode the 0-4 base codes of its consensus
+            char *ins = &iaux->inscns[t*iaux->max_ins_len];
+            for (k=0; k<len; k++) fprintf(stderr,"%c","ACGTN"[(int)ins[k]]);
+        }
+        else if ( t==iaux->iref_type )
+            fprintf(stderr,"%d(ref)",len);
+        else
+            fprintf(stderr,"%d",len);
+    }
+    fprintf(stderr,"\n");
+}
+#else
+#define debug_print_types(iaux)
+#endif
+
+// Releases the indel_aux_t attached to bca (if any) together with every buffer it owns.
+// bca->iaux is reset to NULL afterwards: bcf_iaux_gap_prep() uses a NULL iaux as the signal
+// to allocate a fresh structure, so leaving the stale pointer behind would lead to a
+// use-after-free on the next call and a double free on a repeated destroy.
+void bcf_iaux_destroy(bcf_callaux_t *bca)
+{
+    if ( !bca->iaux ) return;
+    indel_aux_t *iaux = (indel_aux_t*)bca->iaux;
+    free(iaux->uitmp);
+    free(iaux->inscns);
+    free(iaux->ref_seq);
+    free(iaux->qry_seq);
+    free(iaux->qual);
+    free(iaux->read_scores);
+    rcns_destroy(iaux->rcns);
+    free(iaux);
+    bca->iaux = NULL;
+}
+
+// Derives everything that depends on the reference context of the current position:
+//  - iaux->left, iaux->right .. boundaries of the realignment window
+//  - iaux->ref_qual[]        .. sequence quality of each indel type given the homopolymer run
+//  - bca->indelreg           .. the largest indel region over all non-reference types
+static void iaux_init_sequence_context(indel_aux_t *iaux)
+{
+    bcf_callaux_t *bca = iaux->bca;
+    int t, end;
+
+    // The window spans indel_win_size bases on each side of pos. types[] is sorted in ascending
+    // order, so a negative first element is the largest deletion and extends the right edge.
+    iaux->left = iaux->pos > bca->indel_win_size ? iaux->pos - bca->indel_win_size : 0;
+    end = iaux->pos + bca->indel_win_size;
+    if ( iaux->types[0] < 0 ) end -= iaux->types[0];
+
+    // Stop at the end of the reference in case the window stands out of it
+    iaux->right = iaux->pos;
+    while ( iaux->right < end && iaux->ref[iaux->right] ) iaux->right++;
+
+    // Sequence quality in the context of homopolymers for each indel type. This is the original
+    // est_seqQ() code. FIXME: check if the inserted sequence is consistent with the homopolymer run
+    int l_run = bcf_cgp_l_run(iaux->ref, iaux->pos);    // length of the homopolymer run around pos
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        int len = abs(iaux->types[t]);
+        int q = bca->openQ + bca->extQ * (len - 1);
+        if ( l_run >= 3 )
+        {
+            int qh = (int)(bca->tandemQ * (double)len / l_run + .499);
+            if ( q > qh ) q = qh;
+        }
+        else if ( q > 1000 ) q = 1000;
+        iaux->ref_qual[t] = q < 255 ? q : 255;
+    }
+
+    // Determine the indel region, this makes the difference between e.g. T>TA vs TA>TAA
+    int max_reg = 0;
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        int len = iaux->types[t], reg;
+        if ( len == 0 ) continue;   // the reference type has no indel region
+        if ( len > 0 )
+            reg = est_indelreg(iaux->pos, iaux->ref, len, &iaux->inscns[t*iaux->max_ins_len]);
+        else
+            reg = est_indelreg(iaux->pos, iaux->ref, -len, 0);
+        if ( reg > max_reg ) max_reg = reg;
+    }
+    bca->indelreg = max_reg;
+}
+
+// Makes sure iaux->read_scores can hold one int per (read, indel type) pair of sample ismpl
+// and zeroes that part of the buffer. Returns 0 on success, -1 if the buffer cannot be grown.
+static int iaux_init_scores(indel_aux_t *iaux, int ismpl)
+{
+    int n = iaux->nplp[ismpl] * iaux->ntypes;
+    if ( iaux->mread_scores < n )
+    {
+        int *tmp = (int*) realloc(iaux->read_scores,n*sizeof(*tmp));
+        if ( !tmp ) return -1;
+        iaux->mread_scores = n;
+        iaux->read_scores = tmp;
+    }
+    // memset() counts bytes: clear all n ints, not just the first n bytes
+    if ( n > 0 ) memset(iaux->read_scores,0,n*sizeof(*iaux->read_scores));
+    return 0;
+}
+
+// Returns 1 if any read of any sample carries an indel at the current position, 0 otherwise
+static int _have_indel_reads(indel_aux_t *iaux)
+{
+    int s = 0;
+    while ( s < iaux->nsmpl )
+    {
+        int r = iaux->nplp[s];
+        while ( r-- > 0 )
+        {
+            if ( iaux->plp[s][r].indel ) return 1;
+        }
+        s++;
+    }
+    return 0;
+}
+
+// For insertions only their sizes were collected so far. Now go through the reads and
+// create consensus sequence for each insert, therefore note that there can be only one
+// sequence per insertion length.
+//
+// On success iaux->inscns[itype*max_ins_len + i] holds the consensus base (0-4, an index to
+// "ACGTN") of the i-th inserted base of indel type itype. Insertions whose consensus contains
+// an N are discarded by resetting their types[] entry to 0.
+// Returns 0 on success, -1 on memory allocation failure.
+static int iaux_init_ins_types(indel_aux_t *iaux)
+{
+    if ( !iaux->max_ins_len ) return 0;
+
+    // aux[5*(itype*max_ins_len + i) + base] counts how often each base was seen
+    uint32_t *aux;
+    int naux = 5 * iaux->ntypes * iaux->max_ins_len;
+    if ( iaux->muitmp < naux )
+    {
+        aux = (uint32_t*) realloc(iaux->uitmp,naux*sizeof(*aux));
+        if ( !aux ) return -1;
+        iaux->uitmp = aux;
+        iaux->muitmp = naux;
+    }
+    else aux = iaux->uitmp;
+    memset(aux,0,naux*sizeof(*aux));
+
+    // count the number of occurrences of each base at each position for each type of insertion
+    int t,s,i,j;
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        if ( iaux->types[t] <= 0) continue;
+        for (s=0; s<iaux->nsmpl; s++)
+        {
+            for (i=0; i<iaux->nplp[s]; i++)
+            {
+                bam_pileup1_t *plp = iaux->plp[s] + i;
+                if ( plp->indel != iaux->types[t] ) continue;
+                uint8_t *seq = bam_get_seq(plp->b);
+                for (j=0; j<plp->indel; j++)
+                {
+                    // the inserted bases follow the base at qpos
+                    int c = seq_nt16_int[bam_seqi(seq, plp->qpos+j+1)];
+                    assert(c<5);
+                    aux[5*(t*iaux->max_ins_len+j) + c]++;
+                }
+            }
+        }
+    }
+
+    // The consensus buffer needs one byte per (type, position); size it by ncns, not by the
+    // size of the counts array
+    char *cns;
+    int ncns = iaux->ntypes * iaux->max_ins_len;
+    if ( iaux->minscns < ncns )
+    {
+        cns = (char*) realloc(iaux->inscns,ncns*sizeof(*cns));
+        if ( !cns ) return -1;
+        iaux->inscns = cns;
+        iaux->minscns = ncns;
+    }
+    else cns = iaux->inscns;
+    // Clear the consensus buffer. The counts in aux[] must be left intact, they are read below.
+    memset(cns,0,ncns*sizeof(*cns));
+
+    // use the majority rule to construct the consensus
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        for (i=0; i<iaux->types[t]; i++)     // this naturally includes only insertions
+        {
+            uint32_t *tmp = &aux[5*(t*iaux->max_ins_len+i)], max = tmp[0], max_j = 0;
+            for (j=1; j<5; j++)
+                if ( max < tmp[j] ) max = tmp[j], max_j = j;
+            cns[t*iaux->max_ins_len + i] = max ? max_j : 4;
+            if ( max_j==4 ) { iaux->types[t] = 0; break; }   // discard insertions which contain N's
+        }
+    }
+    return 0;
+}
+
+// Indel sizes are stored in an unsigned array as MINUS_CONST+size so that, after sorting,
+// deletions (negative sizes) come first, then the reference type, then insertions
+#define MINUS_CONST 0x10000000
+// Collects the indel types present at the current position and, when the site has enough
+// support, fills iaux->types[], iaux->iref_type, iaux->max_ins_len, the insertion consensus
+// and the sequence context.
+// Returns
+//   >1 .. the number of indel types (including the reference type)
+//    0 .. nothing to call: no indel reads, insufficient support or only one type left
+//   -1 .. the site is skipped (too many N's in the reference, too many indel types) or a
+//         memory allocation failed
+static int iaux_init_types(indel_aux_t *iaux)
+{
+    if ( !_have_indel_reads(iaux) ) return 0;
+
+    iaux->bca->max_support = 0;
+    memset(iaux->sum_qual,0,MAX_TYPES*sizeof(*iaux->sum_qual));
+
+    int i,j, nreads = 0;
+    for (i=0; i<iaux->nsmpl; i++) nreads += iaux->nplp[i];
+
+    // One slot per read plus one for the reference type
+    uint32_t *aux;
+    if ( iaux->muitmp < nreads+1 )
+    {
+        aux = (uint32_t*) realloc(iaux->uitmp,(nreads+1)*sizeof(*iaux->uitmp));
+        if ( !aux ) return -1;
+        iaux->uitmp = aux;
+        iaux->muitmp = nreads+1;
+    }
+    else aux = iaux->uitmp;
+    memset(aux,0,(nreads+1)*sizeof(*aux));
+
+    int naux = 0, indel_support_ok = 0, n_alt = 0, n_tot = 0;
+    int max_rd_len = 0;   // max sequence length that includes ref+del bases
+
+    // Fill out aux[] array with all the non-zero indel sizes. This is an unsorted list with as many
+    // entries as there are reads
+    aux[naux++] = MINUS_CONST;  // zero indel is always a type (REF)
+    for (i=0; i<iaux->nsmpl; i++)
+    {
+        int nalt = naux, ntot = 0;  // per sample values
+        for (j=0; j<iaux->nplp[i]; j++)
+        {
+            const bam_pileup1_t *plp = iaux->plp[i] + j;
+            ntot++;
+            if ( plp->indel ) aux[naux++] = MINUS_CONST + plp->indel;
+            if ( !PLP_QLEN(&plp->cd) ) PLP_QLEN(&plp->cd) = bam_cigar2qlen(plp->b->core.n_cigar, bam_get_cigar(plp->b));
+            if ( PLP_QLEN(&plp->cd) > max_rd_len ) max_rd_len = PLP_QLEN(&plp->cd);
+        }
+        nalt = naux - nalt;     // number of indel reads in this sample
+        if ( iaux->bca->per_sample_flt )
+        {
+            // Fraction of indel reads among the reads of this sample. The denominator must be
+            // the sample's own read count, not naux (the running number of indel reads in all
+            // samples seen so far plus one).
+            double frac = ntot ? (double)nalt/ntot : 0;
+            if ( nalt >= iaux->bca->min_support && frac >= iaux->bca->min_frac ) indel_support_ok = 1;
+            if ( nalt > iaux->bca->max_support && frac > 0 ) iaux->bca->max_support = nalt, iaux->bca->max_frac = frac;
+        }
+        else
+        {
+            n_alt += nalt;
+            n_tot += ntot;
+        }
+    }
+
+    // Check if the minimum required number of indel reads has been observed
+    if ( !iaux->bca->per_sample_flt && n_alt >= iaux->bca->min_support && (double)n_alt/n_tot >= iaux->bca->min_frac ) indel_support_ok = 1;
+    if ( naux==1 || !indel_support_ok ) return 0;
+
+    // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), check the number of N's in the
+    // sequence and skip places where half or more reference bases in the sequence that follows pos are Ns
+    int nN = 0, i_end = iaux->pos + (iaux->bca->indel_win_size < max_rd_len ? iaux->bca->indel_win_size : max_rd_len);
+    for (i=iaux->pos; i<i_end && iaux->ref[i]; i++)
+        if ( iaux->ref[i] == 'N' ) nN++;
+    if ( 2*nN > i - iaux->pos ) return -1;
+
+    // Sort aux[] and dedup indel types
+    int n_types = 1;
+    ks_introsort(uint32_t, naux, aux);
+    for (i=1; i<naux; i++)
+        if ( aux[i] != aux[i-1] ) n_types++;
+
+    if ( n_types >= MAX_TYPES )
+    {
+        static int warned = 0;
+        if ( !warned )
+        {
+            fprintf(stderr, "Warning: excessive number of INDEL alleles at %s:%d, skipping. (This warning is printed only once)\n",iaux->chr,iaux->pos+1);
+            warned = 1;
+        }
+        return -1;
+    }
+
+    // Fill out the types[] array detailing the size of insertion or deletion.
+    iaux->ntypes = 0;
+    iaux->max_ins_len = 0;
+    for (i=0; i<naux; i++)
+    {
+        int isize = (int32_t)(aux[i] - MINUS_CONST);
+        for (j=i+1; j<naux; j++)    // aux[i..j-1] is the run of reads sharing this indel size
+            if ( aux[j] != aux[i] ) break;
+
+        // Only include the REF type and types with sufficient support. Note that the position
+        // already passed, this is just to reduce the number of indel types. The check is
+        // permissive, the thresholds min_support and min_frac are not enforced in per-sample mode
+        int is_ok = 0;
+        if ( !isize )
+        {
+            is_ok = 1;
+            iaux->iref_type = iaux->ntypes;
+        }
+        else
+        {
+            if ( j-i >= iaux->bca->min_support ) is_ok = 1;
+            // What is the best way to handle the -pmF options:
+            //  - consider only sites where a single indel type passes the -mF threshold, as opposed to all indel types cumulatively
+            //  - once a site passes, include all indel types in the evaluation, as opposed to considering only the strong candidates
+            // In this implementation sites are selected by counting reads from all indel types cumulatively and all indel types
+            // are considered.
+            // Uncomment the following condition to consider only strong indel candidates once the site has been selected
+            //  if ( !iaux->bca->per_sample_flt && (double)(j-i) / n_tot < iaux->bca->min_frac ) is_ok = 0;
+        }
+        if ( is_ok )
+        {
+            iaux->types[iaux->ntypes++] = isize;
+            if ( isize > 0 && isize > iaux->max_ins_len ) iaux->max_ins_len = isize;
+        }
+        i = j-1;
+    }
+    if ( iaux->ntypes <= 1 ) return 0;
+
+    // Init insertion types, including their sequence
+    if ( iaux_init_ins_types(iaux) < 0 ) return -1;
+
+    iaux_init_sequence_context(iaux);
+
+    return iaux->ntypes;
+}
+#undef MINUS_CONST
+
+// Builds the consensus sequence template(s) of sample ismpl within the window left..right and
+// stores them in iaux->cns_seq. The read-consensus object is created on first use and reset
+// on subsequent calls.
+// Returns 0 on success and -1 when no consensus object or no consensus is available; a NULL
+// from rcns_init() or rcns_get_consensus() is presumed to signal failure, and
+// iaux_score_reads() would dereference a NULL iaux->cns_seq unconditionally.
+static int iaux_set_consensus(indel_aux_t *iaux, int ismpl)
+{
+    if ( !iaux->rcns )
+    {
+        iaux->rcns = rcns_init(iaux->pos, iaux->left, iaux->right);
+        if ( !iaux->rcns ) return -1;
+    }
+    else
+        rcns_reset(iaux->rcns, iaux->pos, iaux->left, iaux->right);
+
+    rcns_set_reads(iaux->rcns, iaux->plp[ismpl], iaux->nplp[ismpl]);
+
+    iaux->cns_seq = rcns_get_consensus(iaux->rcns, iaux->ref + iaux->left);
+    if ( !iaux->cns_seq ) return -1;
+
+// todo:
+//  rcns should also collect localized number of mismatches as a substitute
+//  for uninformative MQ. This would not affect calling but would help with
+//  filtering
+
+    return 0;
+}
+
+#if 0
+// Returns the smallest index of the seq_pos array whose value equals pos or, when pos is not
+// present, the largest index whose value is smaller than pos. The search is a linear walk from
+// the initial guess ioff (clamped to the array); a binary search is not used on the assumption
+// that the guess is at most an indel size away from the actual coordinate.
+//
+// TODO: remove this function and seq_pos from cns creation as it seems unnecessary
+static int find_ref_offset(hts_pos_t pos, hts_pos_t *seq_pos, int nseq_pos, int ioff)
+{
+    int k = ioff;
+    if ( k < 0 ) k = 0;
+    else if ( k >= nseq_pos ) k = nseq_pos - 1;
+
+    if ( seq_pos[k] >= pos )
+    {
+        // at or past the target: walk left to the first element that is not smaller than pos
+        for (; k > 0 && seq_pos[k-1] >= pos; k--) ;
+        return k;
+    }
+
+    // before the target: walk right, then step back if we overshot
+    for (; k+1 < nseq_pos && seq_pos[k] < pos; k++) ;
+    return seq_pos[k] > pos ? k - 1 : k;
+}
+#endif
+
+// Aligns one read against the template sequence ref_seq (nref_seq bases, starting at reference
+// coordinate iaux->left) and returns its score.
+//
+// Both the read and the template are first trimmed to their overlap within the window
+// left..right, the read bases and capped qualities are copied into iaux->qry_seq and iaux->qual
+// (grown on demand), and the pair is passed to probaln_glocal().
+//
+// Returns
+//   raw_score<<8 | adj_score .. adj_score is the score per 100 query bases, scaled by
+//                               bca->indel_bias and capped at 255
+//   1                        .. the read is unmapped and was skipped
+//   -1                       .. memory allocation failure
+// NOTE(review): the value 1 returned for unmapped reads is indistinguishable from a packed
+// score of raw=0, adj=1 -- confirm callers are fine with that.
+static int iaux_align_read(indel_aux_t *iaux, bam1_t *bam, uint8_t *ref_seq, int nref_seq)
+{
+ if ( bam->core.flag & BAM_FUNMAP ) return 1; // skip unmapped reads
+
+ // Trim both ref and qry to the window of interest
+ hts_pos_t ref_beg = iaux->left; // fa ref coordinates
+ hts_pos_t ref_end = iaux->right < ref_beg + nref_seq ? iaux->right : ref_beg + nref_seq - 1;
+
+ // qry_off1..qry_off2 are inclusive offsets into the read, ref_off1..ref_off2 inclusive offsets
+ // into ref_seq
+ cigar_state_t cigar;
+ cstate_init(&cigar,bam);
+ int qry_off1, qry_off2, ref_off1, ref_off2;
+ if ( ref_beg > bam->core.pos )
+ {
+ // the read needs trimming from left
+ qry_off1 = cstate_seek_fwd(&cigar, &ref_beg, 1);
+ ref_off1 = ref_beg - iaux->left;
+
+ if ( ref_beg + (bam->core.l_qseq - qry_off1) > ref_end )
+ {
+ // the read needs trimming from right
+ qry_off2 = ref_end - ref_beg + qry_off1;
+ ref_off2 = ref_end - iaux->left;
+ }
+ else
+ {
+ // the ref template needs trimming from right
+ qry_off2 = bam->core.l_qseq - 1;
+ ref_off2 = ref_off1 + qry_off2 - qry_off1;
+ }
+ }
+ else
+ {
+ // the ref template needs trimming from left
+ qry_off1 = 0;
+ ref_off1 = bam->core.pos - ref_beg;
+
+ if ( bam->core.pos + bam->core.l_qseq - 1 > ref_end )
+ {
+ // the read needs trimming from right
+ ref_off2 = ref_end - iaux->left;
+ qry_off2 = ref_off2 - ref_off1;
+ }
+ else
+ {
+ // the ref template needs trimming from right
+ qry_off2 = bam->core.l_qseq - 1;
+ ref_off2 = ref_off1 + qry_off2 - qry_off1;
+ }
+ }
+//fprintf(stderr,"xtrim: %s .. left,right=%d,%d rbeg,end=%d,%d qpos=%d qlen=%d qoff=%d,%d roff=%d,%d rlen=%d\n",bam_get_qname(bam),iaux->left,iaux->right,(int)ref_beg,(int)ref_end,(int)bam->core.pos,bam->core.l_qseq, qry_off1,qry_off2,ref_off1,ref_off2,nref_seq);
+
+ // Sanity checks: both trimmed ranges must be non-empty and lie within their sequences
+ assert( qry_off1<=qry_off2 );
+ assert( qry_off1>=0 && qry_off1<bam->core.l_qseq );
+ assert( qry_off2>=0 && qry_off2<bam->core.l_qseq );
+
+ assert( ref_off1<=ref_off2 );
+ assert( ref_off1>=0 && ref_off1<nref_seq );
+ assert( ref_off2>=0 && ref_off2<nref_seq );
+
+ // prepare query sequence
+ int i, qlen = qry_off2 - qry_off1 + 1, rlen = ref_off2 - ref_off1 + 1;
+ if ( iaux->nqry_seq < qlen )
+ {
+ uint8_t *tmp = (uint8_t*) realloc(iaux->qry_seq, qlen);
+ if ( !tmp ) return -1; // critical error
+ iaux->qry_seq = tmp;
+ iaux->nqry_seq = qlen;
+ }
+ uint8_t *seq = bam_get_seq(bam);
+ for (i=qry_off1; i<=qry_off2; i++) iaux->qry_seq[i-qry_off1] = seq_nt16_int[bam_seqi(seq,i)];
+
+ // prepare qualities, either BQ or BAQ qualities (ZQ)
+ if ( iaux->nqual < qlen )
+ {
+ uint8_t *tmp = (uint8_t*) realloc(iaux->qual, qlen);
+ if ( !tmp ) return -1; // critical error
+ iaux->qual = tmp;
+ iaux->nqual = qlen;
+ }
+ uint8_t *qual = iaux->qual;
+ const uint8_t *qq = bam_get_qual(bam);
+ const uint8_t *bq = (uint8_t*)bam_aux_get(bam, "ZQ");
+ if ( bq ) bq++; // skip type
+ for (i=qry_off1; i<=qry_off2; i++)
+ {
+ int j = i - qry_off1;
+ // when the ZQ tag is present, its per-base offset (stored with +64) is added to the base quality
+ qual[j] = bq ? qq[i] + (bq[i] - 64) : qq[i];
+ // keep the qualities within 7..30
+ if ( qual[j] > 30 ) qual[j] = 30;
+ if ( qual[j] < 7 ) qual[j] = 7;
+ }
+
+// Illumina
+probaln_par_t apf = { 1e-4, 1e-2, 10 };
+
+ // align
+ int score = probaln_glocal(ref_seq + ref_off1, rlen, iaux->qry_seq, qlen, qual, &apf, 0, 0);
+ // length-normalized score (per 100 query bases, rounded), scaled by indel_bias
+ int adj_score = (int)(100. * score / qlen + .499) * iaux->bca->indel_bias;
+
+#if DEBUG_ALN
+ fprintf(stderr,"aln: %d/%d\t%s\n\tref: ",score,adj_score,bam_get_qname(bam));
+ for (i=0; i<rlen; i++) fprintf(stderr,"%c","ACGTN"[(int)ref_seq[ref_off1 + i]]);
+ fprintf(stderr,"\n\tqry: ");
+ for (i=0; i<qlen; i++) fprintf(stderr,"%c","ACGTN"[(int)iaux->qry_seq[i]]);
+ fprintf(stderr,"\n\tqual: ");
+ for (i=0; i<qlen; i++) fprintf(stderr,"%c",(char)(qual[i]+64));
+ fprintf(stderr,"\n\ttrim: qry_len=%d qry_off=%d,%d ref_len=%d ref_off=%d,%d ref_beg,end=%d,%d\n",qlen,qry_off1,qry_off2,rlen,ref_off1,ref_off2,(int)ref_beg,(int)ref_end);
+#endif
+
+ // pack: the adjusted score must fit the low 8 bits, the raw score goes above them
+ if ( adj_score > 255 ) adj_score = 255;
+ return score<<8 | adj_score;
+}
+
+// Score all reads for this sample and indel type using the up to two consensus sequence templates.
+// For each template (iaux->cns_seq is an array terminated by an entry with nseq==0) the indel
+// is applied right after the base at cns->ipos and every read of the sample is realigned to
+// the result; the lowest score over the templates is kept.
+// On output sets iaux->read_scores[iread*ntypes+itype] = (raw_score<<8 | length_adjusted_score)
+// Returns 0 on success, -1 on memory allocation failure or when a read could not be aligned.
+static int iaux_score_reads(indel_aux_t *iaux, int ismpl, int itype)
+{
+    int i;
+    cns_seq_t *cns = iaux->cns_seq;
+    while ( cns->nseq )
+    {
+        // Resize buffers if necessary
+        int ref_len = cns->nseq + iaux->types[itype];
+        if ( iaux->nref_seq < ref_len )
+        {
+            uint8_t *ref_buf = (uint8_t*) realloc(iaux->ref_seq,sizeof(uint8_t)*ref_len);
+            if ( !ref_buf ) return -1;
+            iaux->ref_seq = ref_buf;
+            iaux->nref_seq = ref_len;
+        }
+
+        // Apply the indel and create the template ref sequence: first the bases up to and
+        // including ipos, unchanged...
+        memcpy(iaux->ref_seq,cns->seq,(cns->ipos+1)*sizeof(*iaux->ref_seq));
+        if ( iaux->types[itype] < 0 )   // deletion: skip -types[itype] bases of the consensus
+            memcpy(iaux->ref_seq + cns->ipos + 1, cns->seq + cns->ipos + 1 - iaux->types[itype], (cns->nseq - cns->ipos - 1 + iaux->types[itype])*sizeof(*iaux->ref_seq));
+        else
+        {
+            // insertion (or the reference type, for which nothing is inserted): put in the
+            // insertion consensus and append the rest of the template
+            char *ins = &iaux->inscns[itype*iaux->max_ins_len];
+            for (i=0; i<iaux->types[itype]; i++) iaux->ref_seq[cns->ipos+1+i] = ins[i];
+            memcpy(iaux->ref_seq + cns->ipos + 1 + iaux->types[itype], cns->seq + 1 + cns->ipos, (cns->nseq - cns->ipos - 1)*sizeof(*iaux->ref_seq));
+        }
+
+#if DEBUG_ALN
+        fprintf(stderr,"template %d, type %d, sample %d: ",cns==iaux->cns_seq?0:1,itype,ismpl);
+        for (i=0; i<ref_len; i++) fprintf(stderr,"%c","ACGTN"[(int)iaux->ref_seq[i]]);
+        fprintf(stderr,"\n");
+#endif
+
+        // Align and score reads
+        for (i=0; i<iaux->nplp[ismpl]; i++)
+        {
+            const bam_pileup1_t *plp = iaux->plp[ismpl] + i;
+            int aln_score = iaux_align_read(iaux, plp->b, iaux->ref_seq, ref_len);
+            // A negative value is an error indicator, not a score; do not let it win the
+            // minimum below and end up in read_scores[]
+            if ( aln_score < 0 ) return -1;
+            int *score = &iaux->read_scores[i*iaux->ntypes+itype];
+            if ( cns==iaux->cns_seq || *score > aln_score ) *score = aln_score;
+        }
+        cns++;
+    }
+    return 0;
+}
+
+// Turns the alignment scores of each read of sample ismpl into an indel call and quality, and
+// populates 22 bits of the pileup aux field with three integers as follows
+//      plp->aux = indel_type << 16 | seqQ << 8 | indelQ
+// The indelQ of each read is also added to iaux->sum_qual[] of the type the read was assigned to.
+// Always returns 0.
+static int iaux_eval_scored_reads(indel_aux_t *iaux, int ismpl)
+{
+    const int iref = iaux->iref_type;
+    int r, t;
+    for (r=0; r<iaux->nplp[ismpl]; r++)
+    {
+        bam_pileup1_t *plp = iaux->plp[ismpl] + r;
+        int *score = &iaux->read_scores[r*iaux->ntypes];
+
+        // The best (lowest scoring) non-reference type; the first one wins in case of a tie
+        int alt_score = INT_MAX, alt_j = 0;
+        for (t=0; t<iaux->ntypes; t++)
+        {
+            if ( t==iref ) continue;
+            if ( alt_score > score[t] ) alt_score = score[t], alt_j = t;
+        }
+        int ref_score = score[iref];
+
+        // sc0 is the winner, sc1 the runner-up; the reference wins ties
+        int sc0 = ref_score, sc1 = alt_score, j0 = iref;
+        if ( alt_score < ref_score )
+        {
+            sc0 = alt_score;
+            sc1 = ref_score;
+            j0  = alt_j;
+        }
+
+        // The difference of the raw scores is the indel quality: low=bad, high=good
+        int indelQ = (sc1>>8) - (sc0>>8);
+        int seqQ = iaux->ref_qual[alt_j];
+
+        // Reduce indelQ. High length-normalized alignment scores (i.e. bad alignments)
+        // lower the quality more (e.g. gnuplot> plot [0:111] (1-x/111.)*255)
+        int len_normQ = sc0 & 0xff;     // length-normalized score of the best match (ref or alt)
+        int adj_indelQ;                 // final indelQ used in calling
+        if ( len_normQ <= 111 )
+            adj_indelQ = (int)((1. - len_normQ/111.) * indelQ + .499);
+        else
+        {
+            // The original code set indelQ of reads matching badly to every indel type and the
+            // reference to 0 (adj_indelQ = 0), which effectively removed them from calling. With
+            // many soft clipped reads and a few well matching indel reads (see
+            // noisy-softclips.bam in mpileup-tests) only the latter stayed visible to the caller
+            // and the indel was called with high quality. Badly matching reads are therefore
+            // turned into low quality reference reads instead; the value was chosen so that the
+            // test case is still called as an indel, but with very low quality.
+            adj_indelQ = 12;
+            j0 = iref;
+        }
+
+#if DEBUG_ALN
+        // Prints the selected indel type (itype); adjusted indelQ which will be used if bigger than seqQ;
+        // raw indelQ; length-normalized indelQ and sequence context quality; ref and best alt indel type
+        // and their raw and length-normalized scores
+        fprintf(stderr,"itype=%d adj_indelQ=%d\trawQ=%d\tlen_normQ=%d\tseqQ=%d\tref:%d=%d/%d alt:%d=%d/%d)\t%s\n",
+            j0,adj_indelQ,indelQ,len_normQ,seqQ,iref,ref_score>>8,ref_score&0xff,alt_j,alt_score>>8,alt_score&0xff,bam_get_qname(plp->b));
+#endif
+
+        if ( adj_indelQ > seqQ ) adj_indelQ = seqQ;     // seqQ already capped at 255
+        plp->aux = j0<<16 | seqQ<<8 | adj_indelQ;       // use 22 bits in total
+        iaux->sum_qual[j0] += adj_indelQ;
+    }
+    return 0;
+}
+
+// Find the best indel types, include the ref type plus maximum three alternate indel alleles.
+// The types are ranked by the summed indel quality of the reads assigned to them, the reference
+// type is forced to rank 0, and the first four are copied to bca->indel_types[] (with their
+// insertion consensus in bca->inscns). The type index stored in plp->aux of every read is then
+// remapped to the new ranking; reads of types that did not make it keep only the type index,
+// their qualities are zeroed.
+// Returns the number of reads assigned to a non-reference type, or -1 on memory allocation
+// failure.
+static int iaux_eval_best_indels(indel_aux_t *iaux)
+{
+    bcf_callaux_t *bca = iaux->bca;
+    bca->maxins = iaux->max_ins_len;
+    if ( bca->maxins > 0 )
+    {
+        // Room for four consensus sequences of maxins bases. Go through a temporary so that the
+        // existing buffer is not lost on failure, and avoid a zero-sized realloc when there are
+        // no insertions (nothing is read from or written to inscns in that case).
+        char *buf = (char*) realloc(bca->inscns, (size_t)bca->maxins * 4);
+        if ( !buf ) return -1;
+        bca->inscns = buf;
+    }
+
+    // insertion sort, descending, high-quality indels come first. The type index is packed into
+    // the low 6 bits of the key (MAX_TYPES is 64) so that it survives the sorting
+    int i,j,t, tmp, *sumq = iaux->sum_qual, ntypes = iaux->ntypes;
+    for (t=0; t<ntypes; t++) sumq[t] = sumq[t]<<6 | t;
+    for (t=1; t<ntypes; t++)
+        for (j=t; j>0 && sumq[j] > sumq[j-1]; j--)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t=0; t<ntypes; t++)    // look for the reference type
+        if ( (sumq[t]&0x3f)==iaux->iref_type ) break;
+    if ( t )
+    {
+        // move the reference type to the first
+        tmp = sumq[t];
+        for (; t>0; t--) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+
+    // Initialize bca's structures and create a mapping between old and new types
+    int old2new_type[MAX_TYPES];
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        int itype = sumq[t] & 0x3f;
+        old2new_type[itype] = t;
+        if ( t>=4 ) continue;   // bca->indel_types[] holds four types only
+        bca->indel_types[t] = iaux->types[itype];
+        if ( bca->indel_types[t] <= 0 ) continue;   // only insertions have a consensus sequence
+        memcpy(&bca->inscns[t*bca->maxins], &iaux->inscns[itype*iaux->max_ins_len], bca->maxins);
+    }
+
+    // Update indel type in plp->aux for all reads
+    int ismpl, n_alt = 0;
+    for (ismpl=0; ismpl<iaux->nsmpl; ismpl++)
+    {
+        for (i=0; i<iaux->nplp[ismpl]; i++)
+        {
+            bam_pileup1_t *plp = iaux->plp[ismpl] + i;
+            int itype_old = (plp->aux >> 16) & 0x3f;
+            int itype_new = old2new_type[itype_old];
+            plp->aux = itype_new<<16 | (itype_new>=4 ? 0 : (plp->aux & 0xffff));
+            if ( itype_new>0 ) n_alt++;
+        }
+    }
+    return n_alt;
+}
+
+/*
+    Entry point of the indel caller: finds the indel types at pos, realigns the reads of every
+    sample to consensus templates carrying each indel type and annotates the reads.
+
+    notes:
+    - n .. number of samples
+    - n_plp, plp .. per-sample number of reads and the reads themselves
+    - ref .. reference sequence, bca .. calling parameters and output (indel_types, inscns,
+      maxins, indelreg)
+    - the routine sets bam_pileup1_t.aux (27 bits) of each read as follows:
+        - 5: unused
+        - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
+        - 8: estimated sequence quality                     .. (aux>>8)&0xff
+        - 8: indel quality                                  .. aux&0xff
+    - returns 0 if at least one read was assigned to a non-reference indel type and -1
+      otherwise, that is when there is no indel to call, the site is skipped or an error
+      (such as a failed memory allocation) occurred
+ */
+int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+{
+    assert(!(ref == 0 || bca == 0));    // can this ever happen? when?
+    if (ref == 0 || bca == 0) return -1;
+
+    // The auxiliary structure is created on first use and then reused
+    if ( !bca->iaux )
+    {
+        bca->iaux = calloc(1,sizeof(indel_aux_t));
+        if ( !bca->iaux ) return -1;
+    }
+    indel_aux_t *iaux = (indel_aux_t*) bca->iaux;
+    iaux->nsmpl = n;
+    iaux->nplp  = n_plp;
+    iaux->plp   = plp;
+    iaux->bca   = bca;
+    iaux->ref   = ref;
+    iaux->pos   = pos;
+    iaux->chr   = bca->chr;
+
+    // Check if there is an indel at this position and if yes, find all indel types and determine
+    // window boundaries. todo: We want this information cached so that for long reads we don't keep
+    // redoing the whole analysis again and again
+    int ntypes = iaux_init_types(iaux);
+    if ( ntypes<=0 ) return -1;
+
+    debug_print_types(iaux);
+
+    // Create two template consensus sequences for each sample (assuming max diploid organism).
+    // Then apply each indel type on top of the templates, realign every read and remember score.
+    // A failure in any step leaves the scores incomplete, therefore stop right away.
+    int i,j;
+    for (i=0; i<iaux->nsmpl; i++)
+    {
+        if ( iaux_set_consensus(iaux, i) < 0 ) return -1;
+        if ( iaux_init_scores(iaux, i) < 0 ) return -1;
+        for (j=0; j<ntypes; j++)
+            if ( iaux_score_reads(iaux, i, j) < 0 ) return -1;
+        if ( iaux_eval_scored_reads(iaux, i) < 0 ) return -1;
+    }
+    int nalt = iaux_eval_best_indels(iaux);
+    return nalt > 0 ? 0 : -1;
+}
+
--- /dev/null
+#include "bcftools.pysam.h"
+
+/* bam2bcf_iaux.c -- modified indel caller
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: pd3@sanger, jkb
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE
+*/
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
+#include "bcftools.h"
+#include "bam2bcf.h"
+#include "read_consensus.h"
+#include "cigar_state.h"
+
+#include <htslib/ksort.h>
+KSORT_INIT_STATIC_GENERIC(uint32_t)
+
+#ifndef DEBUG_ALN
+#define DEBUG_ALN 0
+#endif
+
+#define MAX_TYPES 64
+
+// Working state of the indel caller. One instance hangs off bcf_callaux_t.iaux and is released
+// by bcf_iaux_destroy(); the buffers it owns are grown on demand and reused between calls.
+typedef struct
+{
+ int pos; // current position
+ char *chr; // current chromosome
+ int nsmpl; // number of samples
+ int *nplp; // per-sample number of reads
+ bam_pileup1_t **plp; // per-sample reads
+ bcf_callaux_t *bca; // auxiliary bam2bcf structure
+ const char *ref; // reference genome (ASCII)
+ uint32_t *uitmp; // temporary unsigned int array
+ char *inscns; // insertions consensus "ACGTN"[itype*max_ins_len+i]
+ int muitmp, minscns; // size of uitmp, inscns
+ int iref_type, ntypes, types[MAX_TYPES]; // indel types: index of the reference (0-length) type, number of types, indel lengths (<0 deletion, >0 insertion)
+ int max_ins_len; // largest insertion
+ int left, right; // consensus sequence boundaries, 0-based fa ref coordinates
+ read_cns_t *rcns; // read consensus
+ cns_seq_t *cns_seq; // array of consensus sequences
+ int *cns_pos; // array of relative pos indexes within cns_seq sequences
+ uint8_t *ref_seq, *qry_seq; // reference and query sequence to align
+ int nref_seq, nqry_seq; // the allocated size of ref_seq and qry_seq
+ uint8_t *qual; // per-base qualities handed to the aligner together with qry_seq
+ int nqual; // the allocated size of qual
+ int *read_scores, // read scores for each indel type [ntypes*iread+itype]
+ mread_scores, // the allocated size (number of ints) of read_scores
+ ref_qual[MAX_TYPES], // refseq quality at pos for each indel type in the context of homopolymer runs
+ sum_qual[MAX_TYPES]; // qual contributions to each indel type from all reads
+}
+indel_aux_t;
+
+#if DEBUG_ALN
+// Debugging aid: list all indel types found at the current position on bcftools_stderr.
+// Deletions and the reference type are shown as their signed length, insertions as their
+// consensus sequence.
+static void debug_print_types(indel_aux_t *iaux)
+{
+    int itype;
+    fprintf(bcftools_stderr,"types at %s:%d ntypes=%d... ",iaux->chr,iaux->pos+1,iaux->ntypes);
+    for (itype=0; itype<iaux->ntypes; itype++)
+    {
+        int len = iaux->types[itype];
+        fprintf(bcftools_stderr," type%d=",itype);
+        if ( len > 0 )
+        {
+            // insertion: print the consensus of the inserted bases
+            const char *ins = &iaux->inscns[itype*iaux->max_ins_len];
+            int k;
+            for (k=0; k<len; k++) fprintf(bcftools_stderr,"%c","ACGTN"[(int)ins[k]]);
+        }
+        else if ( itype==iaux->iref_type )
+            fprintf(bcftools_stderr,"%d(ref)",len);
+        else
+            fprintf(bcftools_stderr,"%d",len);
+    }
+    fprintf(bcftools_stderr,"\n");
+}
+#else
+#define debug_print_types(iaux)
+#endif
+
+// Release the indel_aux_t attached to bca->iaux together with all its buffers.
+// Does nothing when no structure is attached. The pointer is cleared afterwards so
+// that a repeated call is harmless and a later bcf_iaux_gap_prep() allocates afresh
+// instead of touching freed memory.
+void bcf_iaux_destroy(bcf_callaux_t *bca)
+{
+    if ( !bca->iaux ) return;
+    indel_aux_t *iaux = (indel_aux_t*)bca->iaux;
+    free(iaux->uitmp);
+    free(iaux->inscns);
+    free(iaux->ref_seq);
+    free(iaux->qry_seq);
+    free(iaux->qual);
+    free(iaux->read_scores);
+    // NOTE(review): rcns is only set in iaux_set_consensus() and can still be NULL here;
+    // confirm that rcns_destroy() accepts NULL
+    rcns_destroy(iaux->rcns);
+    free(iaux);
+    bca->iaux = NULL;
+}
+
+// Set up everything that depends on the reference sequence around the current position:
+//  - iaux->left, iaux->right .. boundaries of the realignment window
+//  - iaux->ref_qual[]        .. per-type sequence quality in the context of homopolymer runs
+//  - bca->indelreg           .. the largest indel region over all non-reference types
+static void iaux_init_sequence_context(indel_aux_t *iaux)
+{
+    bcf_callaux_t *bca = iaux->bca;
+    const char *ref = iaux->ref;
+    const int pos = iaux->pos;
+    int t;
+
+    // Window boundaries. The array types is sorted in ascending order, so the first
+    // element is the largest deletion (if a deletion is present)
+    if ( pos > bca->indel_win_size )
+        iaux->left = pos - bca->indel_win_size;
+    else
+        iaux->left = 0;
+    int right = pos + bca->indel_win_size;
+    if ( iaux->types[0] < 0 ) right -= iaux->types[0];  // extend by the largest deletion length
+
+    // Do not let the window run past the end of the reference sequence
+    int end = pos;
+    while ( end < right && ref[end] ) end++;
+    iaux->right = end;
+
+    // Sequence quality in the context of homopolymers for each indel type
+    const int l_run = bcf_cgp_l_run(ref, pos);   // The length of the homopolymer run around the current position
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        int len = abs(iaux->types[t]);
+
+        // This is the original est_seqQ() code. FIXME: check if the inserted sequence is consistent with the homopolymer run
+        int q = bca->openQ + bca->extQ * (len - 1);
+        if ( l_run >= 3 )
+        {
+            int qh = (int)(bca->tandemQ * (double)len / l_run + .499);
+            if ( q > qh ) q = qh;
+        }
+        else if ( q > 1000 ) q = 1000;
+
+        iaux->ref_qual[t] = q < 255 ? q : 255;
+    }
+
+    // Determine the indel region, this makes the difference between e.g. T>TA vs TA>TAA
+    bca->indelreg = 0;
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        int len = iaux->types[t];
+        if ( len == 0 ) continue;   // the reference type
+
+        // insertions are evaluated with their consensus sequence, deletions without one
+        char *ins = len > 0 ? &iaux->inscns[t*iaux->max_ins_len] : 0;
+        int ireg = est_indelreg(pos, ref, len > 0 ? len : -len, ins);
+        if ( ireg > bca->indelreg ) bca->indelreg = ireg;
+    }
+}
+
+// Make sure iaux->read_scores can hold one score per read and indel type of sample ismpl
+// (nplp[ismpl]*ntypes integers) and reset those entries to zero.
+// Returns 0 on success and -1 when the buffer cannot be enlarged.
+static int iaux_init_scores(indel_aux_t *iaux, int ismpl)
+{
+    int n = iaux->nplp[ismpl] * iaux->ntypes;
+    if ( iaux->mread_scores < n )
+    {
+        int *tmp = (int*) realloc(iaux->read_scores,n*sizeof(*tmp));
+        if ( !tmp ) return -1;
+        iaux->mread_scores = n;
+        iaux->read_scores  = tmp;
+    }
+    // memset takes a size in bytes, not in elements: clear all n integers
+    if ( n > 0 ) memset(iaux->read_scores,0,n*sizeof(*iaux->read_scores));
+    return 0;
+}
+
+// Returns 1 if any read of any sample carries an indel at the current position
+// (non-zero bam_pileup1_t.indel) and 0 otherwise
+static int _have_indel_reads(indel_aux_t *iaux)
+{
+    int ismpl;
+    for (ismpl=0; ismpl<iaux->nsmpl; ismpl++)
+    {
+        int iread = iaux->nplp[ismpl];
+        while ( iread-- > 0 )
+        {
+            if ( iaux->plp[ismpl][iread].indel ) return 1;
+        }
+    }
+    return 0;
+}
+
+// For insertions only their sizes were collected so far. Now go through the reads and
+// create a consensus sequence for each insert. Note that there can therefore be only one
+// sequence per insertion length.
+//
+// On success the consensus of type t starts at iaux->inscns[t*max_ins_len], encoded as
+// indexes into "ACGTN"; entries not covered by an insertion are zero. Insertion types
+// whose majority base is N at some position are disabled by setting types[t] to 0.
+// Returns 0 on success and -1 on allocation failure.
+static int iaux_init_ins_types(indel_aux_t *iaux)
+{
+    if ( !iaux->max_ins_len ) return 0;
+
+    // aux[5*(t*max_ins_len+j)+c] is the number of reads with base c at the j-th inserted
+    // position of type t
+    uint32_t *aux;
+    int naux = 5 * iaux->ntypes * iaux->max_ins_len;
+    if ( iaux->muitmp < naux )
+    {
+        aux = (uint32_t*) realloc(iaux->uitmp,naux*sizeof(*aux));
+        if ( !aux ) return -1;
+        iaux->uitmp  = aux;
+        iaux->muitmp = naux;
+    }
+    else aux = iaux->uitmp;
+    memset(aux,0,naux*sizeof(*aux));
+
+    // count the number of occurrences of each base at each position for each type of insertion
+    int t,s,i,j;
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        if ( iaux->types[t] <= 0) continue;
+        for (s=0; s<iaux->nsmpl; s++)
+        {
+            for (i=0; i<iaux->nplp[s]; i++)
+            {
+                bam_pileup1_t *plp = iaux->plp[s] + i;
+                if ( plp->indel != iaux->types[t] ) continue;
+                uint8_t *seq = bam_get_seq(plp->b);
+                for (j=0; j<plp->indel; j++)
+                {
+                    // the inserted bases follow the base at qpos
+                    int c = seq_nt16_int[bam_seqi(seq, plp->qpos+j+1)];
+                    assert(c<5);
+                    aux[5*(t*iaux->max_ins_len+j) + c]++;
+                }
+            }
+        }
+    }
+
+    // The consensus buffer holds one char per type and inserted position
+    char *cns;
+    int ncns = iaux->ntypes * iaux->max_ins_len;
+    if ( iaux->minscns < ncns )
+    {
+        cns = (char*) realloc(iaux->inscns,ncns*sizeof(*cns));
+        if ( !cns ) return -1;
+        iaux->inscns  = cns;
+        iaux->minscns = ncns;
+    }
+    else cns = iaux->inscns;
+    // Clear the consensus buffer. This must not touch aux, the counts are still needed below
+    memset(cns,0,ncns*sizeof(*cns));
+
+    // use the majority rule to construct the consensus
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        for (i=0; i<iaux->types[t]; i++)    // this naturally includes only insertions
+        {
+            uint32_t *tmp = &aux[5*(t*iaux->max_ins_len+i)], max = tmp[0], max_j = 0;
+            for (j=1; j<5; j++)
+                if ( max < tmp[j] ) max = tmp[j], max_j = j;
+            cns[t*iaux->max_ins_len + i] = max ? max_j : 4;
+            if ( max_j==4 ) { iaux->types[t] = 0; break; }  // discard insertions which contain N's
+        }
+    }
+    return 0;
+}
+
+// Collect the indel types (signed lengths) observed at the current position across all
+// samples and fill iaux->types[] (sorted in ascending order, deletions first), iaux->ntypes,
+// iaux->iref_type and iaux->max_ins_len. When more than the reference type remains, also
+// initializes the insertion consensus sequences and the sequence context.
+//
+// Returns:
+//   >1 .. the number of indel types, the site should be evaluated
+//    0 .. no indel reads, insufficient read support, or only the reference type is left
+//   -1 .. allocation failure, N-rich reference region, or too many indel alleles
+//
+// Indel lengths are stored in an unsigned array offset by MINUS_CONST so that sorting
+// the unsigned values orders deletions before the reference and insertions.
+#define MINUS_CONST 0x10000000
+static int iaux_init_types(indel_aux_t *iaux)
+{
+    if ( !_have_indel_reads(iaux) ) return 0;
+
+    iaux->bca->max_support = 0;
+    memset(iaux->sum_qual,0,MAX_TYPES*sizeof(*iaux->sum_qual));
+
+    int i,j, nreads = 0;
+    for (i=0; i<iaux->nsmpl; i++) nreads += iaux->nplp[i];
+
+    uint32_t *aux;
+    if ( iaux->muitmp < nreads+1 )
+    {
+        aux = (uint32_t*) realloc(iaux->uitmp,(nreads+1)*sizeof(*iaux->uitmp));
+        if ( !aux ) return -1;
+        iaux->uitmp  = aux;
+        iaux->muitmp = nreads+1;
+    }
+    else aux = iaux->uitmp;
+    memset(aux,0,(nreads+1)*sizeof(*aux));
+
+    int naux = 0, indel_support_ok = 0, n_alt = 0, n_tot = 0;
+    int max_rd_len = 0;   // max sequence length that includes ref+del bases
+
+    // Fill out aux[] array with all the non-zero indel sizes. This is an unsorted list with as many
+    // entries as there are reads
+    aux[naux++] = MINUS_CONST;  // zero indel is always a type (REF)
+    for (i=0; i<iaux->nsmpl; i++)
+    {
+        int nalt = naux, ntot = 0;  // per sample values
+        for (j=0; j<iaux->nplp[i]; j++)
+        {
+            const bam_pileup1_t *plp = iaux->plp[i] + j;
+            ntot++;
+            if ( plp->indel ) aux[naux++] = MINUS_CONST + plp->indel;
+            if ( !PLP_QLEN(&plp->cd) ) PLP_QLEN(&plp->cd) = bam_cigar2qlen(plp->b->core.n_cigar, bam_get_cigar(plp->b));
+            if ( PLP_QLEN(&plp->cd) > max_rd_len ) max_rd_len = PLP_QLEN(&plp->cd);
+        }
+        nalt = naux - nalt;
+        if ( iaux->bca->per_sample_flt )
+        {
+            // Fraction of this sample's reads that carry an indel. The denominator is the
+            // sample's read count, not naux, which is a running total of indel entries
+            // over all samples processed so far (plus the REF entry)
+            double frac = ntot ? (double)nalt/ntot : 0;
+            if ( nalt >= iaux->bca->min_support && frac >= iaux->bca->min_frac ) indel_support_ok = 1;
+            if ( nalt > iaux->bca->max_support && frac > 0 ) iaux->bca->max_support = nalt, iaux->bca->max_frac = frac;
+        }
+        else
+        {
+            n_alt += nalt;
+            n_tot += ntot;
+        }
+    }
+
+    // Check if the minimum required number of indel reads has been observed
+    if ( !iaux->bca->per_sample_flt && n_alt >= iaux->bca->min_support && (double)n_alt/n_tot >= iaux->bca->min_frac ) indel_support_ok = 1;
+    if ( naux==1 || !indel_support_ok ) return 0;
+
+    // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), check the number of N's in the
+    // sequence and skip places where half or more reference bases in the sequence that follows pos are Ns
+    int nN = 0, i_end = iaux->pos + (iaux->bca->indel_win_size < max_rd_len ? iaux->bca->indel_win_size : max_rd_len);
+    for (i=iaux->pos; i<i_end && iaux->ref[i]; i++)
+        if ( iaux->ref[i] == 'N' ) nN++;
+    if ( 2*nN > i - iaux->pos ) return -1;
+
+    // Sort aux[] and dedup indel types
+    int n_types = 1;
+    ks_introsort(uint32_t, naux, aux);
+    for (i=1; i<naux; i++)
+        if ( aux[i] != aux[i-1] ) n_types++;
+
+    if ( n_types >= MAX_TYPES )
+    {
+        static int warned = 0;
+        if ( !warned )
+        {
+            fprintf(bcftools_stderr, "Warning: excessive number of INDEL alleles at %s:%d, skipping. (This warning is printed only once)\n",iaux->chr,iaux->pos+1);
+            warned = 1;
+        }
+        return -1;
+    }
+
+    // Fill out the types[] array detailing the size of insertion or deletion.
+    iaux->ntypes = 0;
+    iaux->max_ins_len = 0;
+    for (i=0; i<naux; i++)
+    {
+        int isize = (int32_t)(aux[i] - MINUS_CONST);
+
+        // find the end of the run of identical values, j-i is the number of reads supporting this type
+        for (j=i+1; j<naux; j++)
+            if ( aux[j] != aux[i] ) break;
+
+        // Only include the REF type and types with sufficient support. Note that the position
+        // already passed, this is just to reduce the number of indel types. The check is
+        // permissive, the thresholds min_support and min_frac are not enforced in per-sample mode
+        int is_ok = 0;
+        if ( !isize )
+        {
+            is_ok = 1;
+            iaux->iref_type = iaux->ntypes;
+        }
+        else
+        {
+            if ( j-i >= iaux->bca->min_support ) is_ok = 1;
+            // What is the best way to handle the -pmF options:
+            //  - consider only sites where a single indel type passes the -mF threshold, as opposed to all indel types cumulatively
+            //  - once a site passes, include all indel types in the evaluation, as opposed to considering only the strong candidates
+            // In this implementation sites are selected by counting reads from all indel types cumulatively and all indel types
+            // are considered.
+            // Uncomment the following condition to consider only strong indel candidates once the site has been selected
+            //  if ( !iaux->bca->per_sample_flt && (double)(j-i) / n_tot < iaux->bca->min_frac ) is_ok = 0;
+        }
+        if ( is_ok )
+        {
+            iaux->types[iaux->ntypes++] = isize;
+            if ( isize > 0 && isize > iaux->max_ins_len ) iaux->max_ins_len = isize;
+        }
+        i = j-1;    // continue with the next distinct value
+    }
+    if ( iaux->ntypes <= 1 ) return 0;
+
+    // Init insertion types, including their sequence
+    if ( iaux_init_ins_types(iaux) < 0 ) return -1;
+
+    iaux_init_sequence_context(iaux);
+
+    return iaux->ntypes;
+}
+#undef MINUS_CONST
+
+// Build the consensus template sequences for sample ismpl within the window
+// [iaux->left,iaux->right] and store them in iaux->cns_seq. The read consensus
+// structure is created on first use and reset on subsequent calls.
+// Returns 0 on success and -1 if the consensus structure or sequences are not available.
+static int iaux_set_consensus(indel_aux_t *iaux, int ismpl)
+{
+    if ( !iaux->rcns )
+    {
+        iaux->rcns = rcns_init(iaux->pos, iaux->left, iaux->right);
+        // NOTE(review): assumes rcns_init() reports failure by returning NULL - confirm in read_consensus.h
+        if ( !iaux->rcns ) return -1;
+    }
+    else
+        rcns_reset(iaux->rcns, iaux->pos, iaux->left, iaux->right);
+
+    rcns_set_reads(iaux->rcns, iaux->plp[ismpl], iaux->nplp[ismpl]);
+
+    // iaux_score_reads() dereferences cns_seq unconditionally, do not let a NULL through
+    iaux->cns_seq = rcns_get_consensus(iaux->rcns, iaux->ref + iaux->left);
+    if ( !iaux->cns_seq ) return -1;
+
+    // todo:
+    //  rcns should also collect localized number of mismatches as a substitute
+    //  for uninformative MQ. This would not affect calling but would help with
+    //  filtering
+
+    return 0;
+}
+
+#if 0
+// Finds the smallest index in the seq_pos array holding a value equal to pos, or if there is no
+// such value, the largest index with a value smaller than pos. Starts at the initial guess ioff.
+// This could use a binary search but the assumption is that the initial guess is indel-size close
+// to the actual coordinate.
+//
+// TODO: remove this function and seq_pos from cns creation as it seems unnecessary
+static int find_ref_offset(hts_pos_t pos, hts_pos_t *seq_pos, int nseq_pos, int ioff)
+{
+    // clamp the initial guess to the array
+    int k = ioff < 0 ? 0 : (ioff >= nseq_pos ? nseq_pos - 1 : ioff);
+
+    if ( seq_pos[k] >= pos )
+    {
+        // at or past the target: walk left to the first element not smaller than pos
+        for (; k > 0; k--)
+            if ( seq_pos[k-1] < pos ) break;
+        return k;
+    }
+
+    // before the target: walk right, then step back if pos itself is not present
+    while ( k+1 < nseq_pos && seq_pos[k] < pos ) k++;
+    return seq_pos[k] > pos ? k - 1 : k;
+}
+#endif
+
+// Align one read to the template sequence ref_seq (nref_seq bases, the first base
+// corresponding to the reference coordinate iaux->left). Both the read and the template
+// are first trimmed to their overlap within the window.
+//
+// Returns:
+//   -1 .. critical error (a buffer could not be allocated)
+//    1 .. the read is unmapped and was skipped
+//   otherwise (raw_score<<8 | length_adjusted_score), the adjusted score being capped at 255
+static int iaux_align_read(indel_aux_t *iaux, bam1_t *bam, uint8_t *ref_seq, int nref_seq)
+{
+    if ( bam->core.flag & BAM_FUNMAP ) return 1;   // skip unmapped reads
+
+    // Trim both ref and qry to the window of interest
+    hts_pos_t ref_beg = iaux->left;     // fa ref coordinates
+    hts_pos_t ref_end = iaux->right < ref_beg + nref_seq ? iaux->right : ref_beg + nref_seq - 1;
+
+    cigar_state_t cigar;
+    cstate_init(&cigar,bam);
+    int qry_off1, qry_off2, ref_off1, ref_off2;
+    if ( ref_beg > bam->core.pos )
+    {
+        // the read needs trimming from left
+        qry_off1 = cstate_seek_fwd(&cigar, &ref_beg, 1);
+        ref_off1 = ref_beg - iaux->left;
+
+        if ( ref_beg + (bam->core.l_qseq - qry_off1) > ref_end )
+        {
+            // the read needs trimming from right
+            qry_off2 = ref_end - ref_beg + qry_off1;
+            ref_off2 = ref_end - iaux->left;
+        }
+        else
+        {
+            // the ref template needs trimming from right
+            qry_off2 = bam->core.l_qseq - 1;
+            ref_off2 = ref_off1 + qry_off2 - qry_off1;
+        }
+    }
+    else
+    {
+        // the ref template needs trimming from left
+        qry_off1 = 0;
+        ref_off1 = bam->core.pos - ref_beg;
+
+        if ( bam->core.pos + bam->core.l_qseq - 1 > ref_end )
+        {
+            // the read needs trimming from right
+            ref_off2 = ref_end - iaux->left;
+            qry_off2 = ref_off2 - ref_off1;
+        }
+        else
+        {
+            // the ref template needs trimming from right
+            qry_off2 = bam->core.l_qseq - 1;
+            ref_off2 = ref_off1 + qry_off2 - qry_off1;
+        }
+    }
+
+    assert( qry_off1<=qry_off2 );
+    assert( qry_off1>=0 && qry_off1<bam->core.l_qseq );
+    assert( qry_off2>=0 && qry_off2<bam->core.l_qseq );
+
+    assert( ref_off1<=ref_off2 );
+    assert( ref_off1>=0 && ref_off1<nref_seq );
+    assert( ref_off2>=0 && ref_off2<nref_seq );
+
+    // prepare query sequence
+    int i, qlen = qry_off2 - qry_off1 + 1, rlen = ref_off2 - ref_off1 + 1;
+    if ( iaux->nqry_seq < qlen )
+    {
+        uint8_t *tmp = (uint8_t*) realloc(iaux->qry_seq, qlen);
+        if ( !tmp ) return -1;  // critical error
+        iaux->qry_seq  = tmp;
+        iaux->nqry_seq = qlen;
+    }
+    uint8_t *seq = bam_get_seq(bam);
+    for (i=qry_off1; i<=qry_off2; i++) iaux->qry_seq[i-qry_off1] = seq_nt16_int[bam_seqi(seq,i)];
+
+    // prepare qualities, either BQ or BAQ qualities (ZQ)
+    if ( iaux->nqual < qlen )
+    {
+        uint8_t *tmp = (uint8_t*) realloc(iaux->qual, qlen);
+        if ( !tmp ) return -1;  // critical error
+        iaux->qual  = tmp;
+        iaux->nqual = qlen;
+    }
+    uint8_t *qual = iaux->qual;
+    const uint8_t *qq = bam_get_qual(bam);
+    const uint8_t *bq = (uint8_t*)bam_aux_get(bam, "ZQ");
+    if ( bq ) bq++; // skip type
+    for (i=qry_off1; i<=qry_off2; i++)
+    {
+        // Apply the ZQ offset in a signed int and clamp to [7,30] before storing. Doing
+        // the arithmetic in uint8_t would let a negative result wrap around to a large
+        // value and end up clamped to the upper instead of the lower bound
+        int q = qq[i];
+        if ( bq ) q += bq[i] - 64;
+        if ( q > 30 ) q = 30;
+        if ( q < 7 ) q = 7;
+        qual[i - qry_off1] = q;
+    }
+
+    // Illumina
+    probaln_par_t apf = { 1e-4, 1e-2, 10 };
+
+    // align
+    int score = probaln_glocal(ref_seq + ref_off1, rlen, iaux->qry_seq, qlen, qual, &apf, 0, 0);
+    int adj_score = (int)(100. * score / qlen + .499) * iaux->bca->indel_bias;
+
+#if DEBUG_ALN
+    fprintf(bcftools_stderr,"aln: %d/%d\t%s\n\tref:  ",score,adj_score,bam_get_qname(bam));
+    for (i=0; i<rlen; i++) fprintf(bcftools_stderr,"%c","ACGTN"[(int)ref_seq[ref_off1 + i]]);
+    fprintf(bcftools_stderr,"\n\tqry:  ");
+    for (i=0; i<qlen; i++) fprintf(bcftools_stderr,"%c","ACGTN"[(int)iaux->qry_seq[i]]);
+    fprintf(bcftools_stderr,"\n\tqual: ");
+    for (i=0; i<qlen; i++) fprintf(bcftools_stderr,"%c",(char)(qual[i]+64));
+    fprintf(bcftools_stderr,"\n\ttrim: qry_len=%d qry_off=%d,%d ref_len=%d ref_off=%d,%d ref_beg,end=%d,%d\n",qlen,qry_off1,qry_off2,rlen,ref_off1,ref_off2,(int)ref_beg,(int)ref_end);
+#endif
+
+    // pack both scores into one integer, the adjusted score occupies the low 8 bits
+    if ( adj_score > 255 ) adj_score = 255;
+    return score<<8 | adj_score;
+}
+
+// Score all reads for this sample and indel type using the up to two consensus sequence templates.
+// The templates are read from iaux->cns_seq, an array terminated by an entry with nseq==0. The
+// indel is applied to each template right after the template position ipos and every read is
+// aligned against the result; the smallest alignment score over the templates is kept.
+// On output sets iaux->read_scores[iread*ntypes+itype] = (raw_score<<8 | length_adjusted_score)
+// Returns 0 on success and -1 on allocation failure.
+static int iaux_score_reads(indel_aux_t *iaux, int ismpl, int itype)
+{
+    int i;
+    cns_seq_t *cns = iaux->cns_seq;
+    while ( cns->nseq )
+    {
+        // Resize buffers if necessary
+        int ref_len = cns->nseq + iaux->types[itype];
+        if ( iaux->nref_seq < ref_len )
+        {
+            uint8_t *ref_buf = (uint8_t*) realloc(iaux->ref_seq,sizeof(uint8_t)*ref_len);
+            if ( !ref_buf ) return -1;
+            iaux->ref_seq  = ref_buf;
+            iaux->nref_seq = ref_len;
+        }
+
+        // Apply the indel and create the template ref sequence: the part up to and including
+        // ipos is copied as is...
+        memcpy(iaux->ref_seq,cns->seq,(cns->ipos+1)*sizeof(*iaux->ref_seq));
+        if ( iaux->types[itype] < 0 )   // deletion: skip -types[itype] bases of the template
+            memcpy(iaux->ref_seq + cns->ipos + 1, cns->seq + cns->ipos + 1 - iaux->types[itype], (cns->nseq - cns->ipos - 1 + iaux->types[itype])*sizeof(*iaux->ref_seq));
+        else
+        {
+            // insertion (or reference): put in the inserted bases, then the rest of the template
+            char *ins = &iaux->inscns[itype*iaux->max_ins_len];
+            for (i=0; i<iaux->types[itype]; i++) iaux->ref_seq[cns->ipos+1+i] = ins[i];
+            memcpy(iaux->ref_seq + cns->ipos + 1 + iaux->types[itype], cns->seq + 1 + cns->ipos, (cns->nseq - cns->ipos - 1)*sizeof(*iaux->ref_seq));
+        }
+
+#if DEBUG_ALN
+        fprintf(bcftools_stderr,"template %d, type %d, sample %d: ",cns==iaux->cns_seq?0:1,itype,ismpl);
+        for (i=0; i<ref_len; i++) fprintf(bcftools_stderr,"%c","ACGTN"[(int)iaux->ref_seq[i]]);
+        fprintf(bcftools_stderr,"\n");
+#endif
+
+        // Align and score reads
+        for (i=0; i<iaux->nplp[ismpl]; i++)
+        {
+            const bam_pileup1_t *plp = iaux->plp[ismpl] + i;
+            int aln_score = iaux_align_read(iaux, plp->b, iaux->ref_seq, ref_len);
+
+            // A negative value is the aligner's critical error, not a score: pass it on
+            // rather than letting it win as the best (smallest) score
+            if ( aln_score < 0 ) return -1;
+
+            // the first template initializes the score, the following can only lower it
+            int *score = &iaux->read_scores[i*iaux->ntypes+itype];
+            if ( cns==iaux->cns_seq || *score > aln_score ) *score = aln_score;
+        }
+        cns++;
+    }
+    return 0;
+}
+
+// Determines indel quality for each read of sample ismpl from the scores collected in
+// iaux->read_scores and populates 22 bits of the pileup aux field with three integers:
+//      plp->aux = indel_type << 16 | seqQ << 8 | indelQ
+// The indelQ is also added to iaux->sum_qual[] of the selected indel type. Always returns 0.
+static int iaux_eval_scored_reads(indel_aux_t *iaux, int ismpl)
+{
+    const int iref = iaux->iref_type;
+    int iread;
+    for (iread=0; iread<iaux->nplp[ismpl]; iread++)
+    {
+        bam_pileup1_t *rd = &iaux->plp[ismpl][iread];
+        const int *scores = iaux->read_scores + iread*iaux->ntypes;
+
+        // The best (lowest scoring) non-reference type; the first one wins in case of a tie
+        int best_alt = 0, best_alt_score = INT_MAX, t;
+        for (t=0; t<iaux->ntypes; t++)
+        {
+            if ( t==iref ) continue;
+            if ( scores[t] < best_alt_score ) best_alt_score = scores[t], best_alt = t;
+        }
+        const int ref_score = scores[iref];
+
+        // Order the best alt and the ref: sc0 is the winner, sc1 the loser. The reference wins ties
+        int sc0, sc1, itype;
+        if ( best_alt_score < ref_score )
+        {
+            itype = best_alt;
+            sc0 = best_alt_score;
+            sc1 = ref_score;
+        }
+        else
+        {
+            itype = iref;
+            sc0 = ref_score;
+            sc1 = best_alt_score;
+        }
+
+        // The difference of the raw scores is the indel quality: low=bad, high=good
+        int indelQ = (sc1>>8) - (sc0>>8);
+        int seqQ   = iaux->ref_qual[best_alt];
+
+        // Reduce indelQ. High length-normalized alignment scores (i.e. bad alignments)
+        // lower the quality more (e.g. gnuplot> plot [0:111] (1-x/111.)*255)
+        int len_normQ = sc0 & 0xff;     // length-normalized score of the best match (ref or alt)
+        int adj_indelQ;                 // final indelQ used in calling
+        if ( len_normQ <= 111 )
+            adj_indelQ = (int)((1. - len_normQ/111.) * indelQ + .499);
+        else
+        {
+            // The original code set indelQ of reads matching badly to any indel type or reference
+            // to 0 (adj_indelQ = 0), effectively removing them from calling. This leads to problems
+            // when there are many soft clipped reads and a few good matching indel reads (see
+            // noisy-softclips.bam in mpileup-tests): only the few good quality indel reads would
+            // be visible to the caller and the indel would be called with high quality. Instead,
+            // badly matching reads are turned into low quality reference reads. The threshold was
+            // set to make the test case still be called as an indel, but with very low quality.
+            adj_indelQ = 12;
+            itype = iref;
+        }
+
+#if DEBUG_ALN
+        // Prints the selected indel type (itype); adjusted indelQ which will be used if bigger than seqQ;
+        // raw indelQ; length-normalized indelQ and sequence context quality; ref and best alt indel type
+        // and their raw and length-normalized scores
+        fprintf(bcftools_stderr,"itype=%d adj_indelQ=%d\trawQ=%d\tlen_normQ=%d\tseqQ=%d\tref:%d=%d/%d alt:%d=%d/%d)\t%s\n",
+            itype,adj_indelQ,indelQ,len_normQ,seqQ,iref,ref_score>>8,ref_score&0xff,best_alt,best_alt_score>>8,best_alt_score&0xff,bam_get_qname(rd->b));
+#endif
+
+        if ( adj_indelQ > seqQ ) adj_indelQ = seqQ;     // seqQ already capped at 255
+        rd->aux = itype<<16 | seqQ<<8 | adj_indelQ;     // use 22 bits in total
+        iaux->sum_qual[itype] += adj_indelQ;
+    }
+    return 0;
+}
+
+// Find the best indel types, include the ref type plus maximum three alternate indel alleles.
+//
+// The types are ranked by their summed indel quality (iaux->sum_qual, modified in place),
+// with the reference type forced to rank 0. The top four are copied to bca->indel_types and
+// bca->inscns and the type index stored in each read's plp->aux is translated to the new
+// ranking; reads assigned to a type ranked 4 or lower get their qualities zeroed.
+// Returns the number of reads whose new type is not the reference, or -1 on allocation failure.
+static int iaux_eval_best_indels(indel_aux_t *iaux)
+{
+    bcf_callaux_t *bca = iaux->bca;
+    bca->maxins = iaux->max_ins_len;
+    if ( bca->maxins )
+    {
+        // Grow via a temporary so that the old buffer is not lost on failure. With no
+        // insertions the buffer is left alone: it is not accessed below and a zero-sized
+        // realloc would have implementation-defined results
+        char *buf = (char*) realloc(bca->inscns, bca->maxins * 4);
+        if ( !buf ) return -1;
+        bca->inscns = buf;
+    }
+
+    // insertion sort, descending, high-quality indels come first. The type index is kept
+    // in the low 6 bits (MAX_TYPES is 64) so that it can be recovered after sorting
+    int i,j,t, tmp, *sumq = iaux->sum_qual, ntypes = iaux->ntypes;
+    for (t=0; t<ntypes; t++) sumq[t] = sumq[t]<<6 | t;
+    for (t=1; t<ntypes; t++)
+        for (j=t; j>0 && sumq[j] > sumq[j-1]; j--)
+            tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+    for (t=0; t<ntypes; t++)    // look for the reference type
+        if ( (sumq[t]&0x3f)==iaux->iref_type ) break;
+    if ( t )
+    {
+        // move the reference type to the first
+        tmp = sumq[t];
+        for (; t>0; t--) sumq[t] = sumq[t-1];
+        sumq[0] = tmp;
+    }
+
+    // Initialize bca's structures and create a mapping between old and new types
+    int old2new_type[MAX_TYPES];
+    for (t=0; t<iaux->ntypes; t++)
+    {
+        int itype = sumq[t] & 0x3f;
+        old2new_type[itype] = t;
+        if ( t>=4 ) continue;   // only the four best types are passed on
+        bca->indel_types[t] = iaux->types[itype];
+        if ( bca->indel_types[t] <= 0 ) continue;   // only insertions have a consensus sequence
+        memcpy(&bca->inscns[t*bca->maxins], &iaux->inscns[itype*iaux->max_ins_len], bca->maxins);
+    }
+
+    // Update indel type in plp->aux for all reads
+    int ismpl, n_alt = 0;
+    for (ismpl=0; ismpl<iaux->nsmpl; ismpl++)
+    {
+        for (i=0; i<iaux->nplp[ismpl]; i++)
+        {
+            bam_pileup1_t *plp = iaux->plp[ismpl] + i;
+            int itype_old = (plp->aux >> 16) & 0x3f;
+            int itype_new = old2new_type[itype_old];
+            plp->aux = itype_new<<16 | (itype_new>=4 ? 0 : (plp->aux & 0xffff));
+            if ( itype_new>0 ) n_alt++;
+        }
+    }
+    return n_alt;
+}
+
+/*
+    bcf_iaux_gap_prep() - evaluate indels at position pos: collect the indel types, realign
+    the reads against consensus templates carrying each indel type and record the result
+    in each read's bam_pileup1_t.aux
+
+    notes:
+    - n .. number of samples
+    - the routine sets bam_pileup1_t.aux (27 bits) of each read as follows:
+        - 5: unused
+        - 6: the call; index to bcf_callaux_t.indel_types   .. (aux>>16)&0x3f
+        - 8: estimated sequence quality                     .. (aux>>8)&0xff
+        - 8: indel quality                                  .. aux&0xff
+
+    Returns 0 if at least one read was assigned to a non-reference indel type and -1
+    otherwise, including on errors such as a failed allocation.
+ */
+int bcf_iaux_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+{
+    assert(!(ref == 0 || bca == 0));    // can this ever happen? when?
+    if (ref == 0 || bca == 0) return -1;
+
+    // The auxiliary structure is created on first use and kept for the following positions
+    if ( !bca->iaux )
+    {
+        bca->iaux = calloc(1,sizeof(indel_aux_t));
+        if ( !bca->iaux ) return -1;
+    }
+    indel_aux_t *iaux = bca->iaux;
+    iaux->nsmpl = n;
+    iaux->nplp  = n_plp;
+    iaux->plp   = plp;
+    iaux->bca   = bca;
+    iaux->ref   = ref;
+    iaux->pos   = pos;
+    iaux->chr   = bca->chr;
+
+    // Check if there is an indel at this position and if yes, find all indel types and determine
+    // window boundaries. todo: We want this information cached so that for long reads we don't keep
+    // redoing the whole analysis again and again
+    int ntypes = iaux_init_types(iaux);
+    if ( ntypes<=0 ) return -1;
+
+    debug_print_types(iaux);
+
+    // Create two template consensus sequences for each sample (assuming max diploid organism).
+    // Then apply each indel type on top of the templates, realign every read and remember score.
+    // Any failing step aborts the site, continuing would use missing or stale buffers
+    int i,j;
+    for (i=0; i<iaux->nsmpl; i++)
+    {
+        if ( iaux_set_consensus(iaux, i) < 0 ) return -1;
+        if ( iaux_init_scores(iaux, i) < 0 ) return -1;
+        for (j=0; j<ntypes; j++)
+            if ( iaux_score_reads(iaux, i, j) < 0 ) return -1;
+        iaux_eval_scored_reads(iaux, i);
+    }
+    int nalt = iaux_eval_best_indels(iaux);
+    return nalt > 0 ? 0 : -1;
+}
+
return q < qh? q : qh;
}
-static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
+inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
{
int i, j, max = 0, max_i = pos, score = 0;
l = abs(l);
}
// The length of the homopolymer run around the current position
-static int bcf_cgp_l_run(const char *ref, int pos) {
+int bcf_cgp_l_run(const char *ref, int pos) {
int i, l_run;
int c = seq_nt16_table[(int)ref[pos + 1]];
fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s "
"qbeg=%d tbeg=%d score=%d\n",
pos, types[t], s, i, bam_get_qname(p->b),
- qbeg, tbeg, sc);
+ qbeg, tbeg, score[K*n_types + t]);
#endif
}
}
return q < qh? q : qh;
}
-static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
+inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
{
int i, j, max = 0, max_i = pos, score = 0;
l = abs(l);
}
// The length of the homopolymer run around the current position
-static int bcf_cgp_l_run(const char *ref, int pos) {
+int bcf_cgp_l_run(const char *ref, int pos) {
int i, l_run;
int c = seq_nt16_table[(int)ref[pos + 1]];
fprintf(bcftools_stderr, "pos=%d type=%d read=%d:%d name=%s "
"qbeg=%d tbeg=%d score=%d\n",
pos, types[t], s, i, bam_get_qname(p->b),
- qbeg, tbeg, sc);
+ qbeg, tbeg, score[K*n_types + t]);
#endif
}
}
int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq);
int parse_overlap_option(const char *arg);
-void *smalloc(size_t size); // safe malloc
-
static inline int iupac2bitmask(char iupac)
{
const int A = 1;
13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,0,7,9,0,10
};
if ( iupac > 89 ) return 0;
- if ( nt > 90 ) nt -= 32; // lowercase
+ if ( nt > 90 ) nt -= 32; // lowercase
if ( nt=='A' ) nt = 1;
else if ( nt=='C' ) nt = 2;
else if ( nt=='G' ) nt = 4;
--- /dev/null
+/* cigar_state.h -- API for efficient parsing of CIGAR strings
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: pd3@sanger
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE. */
+
+#ifndef CIGAR_STATE_H
+#define CIGAR_STATE_H
+
+#include <stdint.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+
+// Cursor over the CIGAR of one alignment, set up by cstate_init() and advanced by the
+// cstate_seek_*() functions
+typedef struct
+{
+ bam1_t *bam; // the alignment being parsed
+ uint32_t *cigar; // the alignment's CIGAR operations, bam_get_cigar(bam)
+ uint8_t *seq; // the alignment's query sequence, bam_get_seq(bam)
+ int ncig; // number of CIGAR operations
+ int icig; // position in the cigar string
+ int iseq; // the cigar[icig] operation refers to &seq[iseq]
+ hts_pos_t ref_pos; // reference coordinate, corresponds to iseq; points to
+ // the first base after the read when consumed
+}
+cigar_state_t;
+
+/**
+ *  cstate_init() - Point the cursor at the first CIGAR operation of the alignment,
+ *  i.e. at the first query base and the alignment's leftmost reference coordinate.
+ */
+static inline void cstate_init(cigar_state_t *cs, bam1_t *bam)
+{
+    *cs = (cigar_state_t)
+    {
+        .bam     = bam,
+        .cigar   = bam_get_cigar(bam),
+        .seq     = bam_get_seq(bam),
+        .ncig    = bam->core.n_cigar,
+        .icig    = 0,
+        .iseq    = 0,
+        .ref_pos = bam->core.pos,
+    };
+}
+
+/**
+ *  cstate_seek_fwd() - Move in the cigar forward to find query index that
+ *  matches the reference position.
+ *
+ *  Returns the index to the query sequence cs->seq of the base aligned to the
+ *  reference position *pos_ptr.
+ *
+ *  When the position is not contained within the sequence, either because there
+ *  is a deletion or there is no overlap, the behavior is controlled by the value
+ *  of trim_left:
+ *      - read starts after: qry_beg > pos && trim_left=1 .. returns 0 and sets pos to qry_beg
+ *      - read starts after: qry_beg > pos && trim_left=0 .. returns -1
+ *      - read ends before: qry_end < pos && trim_left=1 .. returns -2
+ *      - read ends before: qry_end < pos && trim_left=0 .. returns qry_len-1 and sets pos to qry_end
+ *      - pos inside a deletion && trim_left=1 .. returns position after the deletion
+ *      - pos inside a deletion && trim_left=0 .. returns position before the deletion
+ *
+ *  CIGAR operations other than M,=,X,I,S,D,N (that is H, P and B) are stepped over
+ *  without moving the query index or the reference position.
+ */
+static inline int cstate_seek_fwd(cigar_state_t *cs, hts_pos_t *pos_ptr, int trim_left)
+{
+    hts_pos_t pos = *pos_ptr;
+    while ( cs->ref_pos <= pos )
+    {
+        if ( cs->icig >= cs->ncig )     // the read ends before pos
+        {
+            if ( trim_left ) return -2;
+            *pos_ptr = cs->ref_pos - 1;
+            return cs->iseq - 1;
+        }
+
+        int op  = cs->cigar[cs->icig] &  BAM_CIGAR_MASK;
+        int len = cs->cigar[cs->icig] >> BAM_CIGAR_SHIFT;
+        if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF )
+        {
+            // consumes both the query and the reference
+            if ( cs->ref_pos + len > pos ) return pos - cs->ref_pos + cs->iseq;  // the cigar op overlaps pos
+            cs->ref_pos += len;
+            cs->iseq += len;
+            cs->icig++;
+        }
+        else if ( op==BAM_CINS || op==BAM_CSOFT_CLIP )
+        {
+            // consumes the query only
+            cs->iseq += len;
+            cs->icig++;
+        }
+        else if ( op==BAM_CDEL || op==BAM_CREF_SKIP )
+        {
+            // consumes the reference only
+            if ( cs->ref_pos + len > pos )
+            {
+                // The deletion overlaps the position. NB: assuming del is never the first or last op
+                *pos_ptr = trim_left ? cs->ref_pos + len : cs->ref_pos - 1;
+                return trim_left ? cs->iseq : cs->iseq - 1;
+            }
+            cs->ref_pos += len;
+            cs->icig++;
+        }
+        else
+        {
+            // The remaining operations (hard clip, padding, back) are not handled by any
+            // branch above. Step over them, otherwise the loop would never make progress
+            cs->icig++;
+        }
+    }
+    // the read starts after pos
+    if ( trim_left )
+    {
+        *pos_ptr = cs->bam->core.pos;
+        return 0;
+    }
+    return -1;
+}
+
+
+/**
+ *  cstate_seek_op_fwd() - Move in the cigar forward to find query index that
+ *  matches the seek operator and the reference position.
+ *
+ *  In order to match a deletion, pass the position of the first deleted base.
+ *  In order to match an insertion, pass the reference coordinate of the base
+ *  after the inserted sequence.
+ *
+ *  When an insertion, soft-clip, deletion or reference skip is matched and oplen
+ *  is not NULL, the length of the operation is stored in *oplen.
+ *
+ *  Returns the index to the query sequence cs->seq
+ *  on success; -1 when there is no such matching position but the cigar
+ *  is still not entirely consumed (e.g. a deletion or a soft-clip); -2
+ *  when there is no overlap (i.e. the read ends before the position).
+ *
+ *  CIGAR operations other than M,=,X,I,S,D,N (that is H, P and B) are stepped over
+ *  without moving the query index or the reference position.
+ */
+static inline int cstate_seek_op_fwd(cigar_state_t *cs, hts_pos_t pos, int seek_op, int *oplen)
+{
+    while ( cs->ref_pos <= pos )
+    {
+        if ( cs->icig >= cs->ncig ) return -2;
+
+        int op  = cs->cigar[cs->icig] &  BAM_CIGAR_MASK;
+        int len = cs->cigar[cs->icig] >> BAM_CIGAR_SHIFT;
+        if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF )
+        {
+            // consumes both the query and the reference
+            if ( cs->ref_pos + len <= pos )
+            {
+                cs->ref_pos += len;
+                cs->iseq += len;
+                cs->icig++;
+                continue;
+            }
+            // the operation overlaps pos
+            if ( seek_op==BAM_CMATCH ) return pos - cs->ref_pos + cs->iseq;
+            return -1;
+        }
+        if ( op==BAM_CINS || op==BAM_CSOFT_CLIP )
+        {
+            // consumes the query only
+            if ( cs->ref_pos == pos && seek_op==op )
+            {
+                if ( oplen ) *oplen = len;
+                return cs->iseq;
+            }
+            if ( cs->ref_pos >= pos ) return -1;
+            cs->iseq += len;
+            cs->icig++;
+            continue;
+        }
+        if ( op==BAM_CDEL || op==BAM_CREF_SKIP )
+        {
+            // consumes the reference only
+            if ( cs->ref_pos == pos && seek_op==op )
+            {
+                if ( oplen ) *oplen = len;
+                return cs->iseq;
+            }
+            if ( cs->ref_pos >= pos ) return -1;
+            cs->ref_pos += len;
+            cs->icig++;
+            continue;
+        }
+
+        // The remaining operations (hard clip, padding, back) are not handled by any
+        // branch above. Step over them, otherwise the loop would never make progress
+        cs->icig++;
+    }
+    return cs->icig < cs->ncig ? -1 : -2;
+}
+
+#endif
/* The MIT License
- Copyright (c) 2014-2022 Genome Research Ltd.
+ Copyright (c) 2014-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include "bcftools.h"
#include "rbuf.h"
#include "filter.h"
+#include "smpl_ilist.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
FILE *fp_out;
FILE *fp_chain;
char **argv;
- int argc, output_iupac, haplotype, allele, isample, napplied;
- uint8_t *iupac_bitmask;
- int miupac_bitmask;
- char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
+ int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied;
+ uint8_t *iupac_bitmask, *iupac_als;
+ int miupac_bitmask, miupac_als;
+ char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
char mark_del, mark_ins, mark_snv;
+ smpl_ilist_t *smpl;
}
args_t;
if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum));
args->hdr = args->files->readers[0].header;
args->isample = -1;
- if ( args->sample )
+ if ( !args->sample )
+ args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
+ else if ( args->sample && strcmp("-",args->sample) )
{
- args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
- if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
+ args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE);
+ if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
}
- if ( (args->haplotype || args->allele) && args->isample<0 )
+ else if ( args->sample_fname )
{
- if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
- args->isample = 0;
+ args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE);
+ if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
+ }
+ if ( args->smpl )
+ {
+ if ( args->haplotype || args->allele )
+ {
+ if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H\n");
+ args->isample = args->smpl->idx[0];
+ }
+ else
+ args->iupac_GTs = 1;
}
int i;
for (i=0; i<args->nmask; i++)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = stdout;
- if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
+ if ( args->isample<0 && !args->iupac_GTs ) fprintf(stderr,"Note: the --samples option not given, applying all records regardless of the genotype\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
args->rid = -1;
}
static void destroy_data(args_t *args)
{
+ free(args->iupac_als);
free(args->iupac_bitmask);
if (args->filter) filter_destroy(args->filter);
+ if ( args->smpl ) smpl_ilist_destroy(args->smpl);
bcf_sr_destroy(args->files);
int i;
for (i=0; i<args->vcf_rbuf.m; i++)
if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
}
}
+// Prepare args->iupac_als for a new record: make room for one flag per allele
+// of rec and clear every flag in the buffer.
+static void iupac_init(args_t *args, bcf1_t *rec)
+{
+    int i;
+    // Unlike hts_expand(), hts_resize() signals an allocation failure through its
+    // return value. Stop here rather than continue with a buffer that is too small
+    // for rec->n_allele flags.
+    if ( hts_resize(uint8_t, rec->n_allele, &args->miupac_als, &args->iupac_als, 0) < 0 )
+        error("Failed to allocate memory at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+    for (i=0; i<args->miupac_als; i++) args->iupac_als[i] = 0;
+}
+// Flag in args->iupac_als every allele referenced by one genotype.
+//   gt, ngt .. the 8-bit encoded genotype values and how many of them to read
+// Returns 1 if at least one non-missing allele was flagged, 0 otherwise.
+static int iupac_add_gt(args_t *args, bcf1_t *rec, uint8_t *gt, int ngt)
+{
+    int k = 0, found = 0;
+    while ( k < ngt )
+    {
+        uint8_t val = gt[k++];
+        if ( bcf_gt_is_missing(val) ) continue;
+        if ( val==(uint8_t)bcf_int8_vector_end ) break;     // no more values in this genotype
+        int allele = bcf_gt_allele(val);
+        if ( allele >= rec->n_allele )
+            error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+        args->iupac_als[allele] = 1;
+        found = 1;
+    }
+    return found;
+}
+// Merge the alleles flagged in args->iupac_als into per-position IUPAC codes.
+// Alleles containing a character that iupac2bitmask() rejects are left out of the merge.
+// The longest merged ALT allele is overwritten in place with the combined codes and
+// its index is returned. When no ALT was merged, the return value is the first
+// flagged ALT, or 0 when only REF is flagged, or -1 when nothing is flagged.
+static int iupac_set_allele(args_t *args, bcf1_t *rec)
+{
+    int i,j, max_len = 0, alt_len = 0, ialt = -1, fallback_alt = -1;
+    for (i=0; i<rec->n_allele; i++)
+    {
+        if ( !args->iupac_als[i] ) continue;
+        if ( fallback_alt <=0 ) fallback_alt = i;   // a flagged REF (0) is replaced by the first flagged ALT
+        int l = strlen(rec->d.allele[i]);
+        for (j=0; j<l; j++)
+            if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
+        if ( j<l ) continue;    // symbolic allele, breakpoint or invalid character in the allele
+        if ( l>max_len )
+        {
+            // Unlike hts_expand(), hts_resize() signals an allocation failure through
+            // its return value; do not write into a buffer that could not be grown.
+            if ( hts_resize(uint8_t, l, &args->miupac_bitmask, &args->iupac_bitmask, HTS_RESIZE_CLEAR) < 0 )
+                error("Failed to allocate memory at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+            // the buffer is reused across records, reset the positions used for the first time here
+            for (j=max_len; j<l; j++) args->iupac_bitmask[j] = 0;
+            max_len = l;
+        }
+        if ( i>0 && l>alt_len )
+        {
+            alt_len = l;
+            ialt = i;
+        }
+        for (j=0; j<l; j++)
+            args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
+    }
+    if ( alt_len > 0 )
+    {
+        for (j=0; j<alt_len; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+        return ialt;
+    }
+    if ( fallback_alt >= 0 ) return fallback_alt;
+    return ialt;
+}
static void apply_variant(args_t *args, bcf1_t *rec)
{
static int warned_haplotype = 0;
}
int ialt = 1; // the alternate allele
- if ( args->isample >= 0 )
+ if ( args->iupac_GTs )
+ {
+ bcf_unpack(rec, BCF_UN_FMT);
+ bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
+ if ( !fmt ) return;
+ if ( fmt->type!=BCF_BT_INT8 )
+ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ ialt = -1;
+ int is_set = 0;
+ iupac_init(args,rec);
+ for (i=0; i<args->smpl->n; i++)
+ {
+ uint8_t *ptr = fmt->p + fmt->size*args->smpl->idx[i];
+ is_set += iupac_add_gt(args, rec, ptr, fmt->n);
+ }
+ if ( !is_set && !args->missing_allele ) return;
+ if ( is_set ) ialt = iupac_set_allele(args, rec);
+ }
+ else if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
else if ( action==use_iupac )
{
ialt = -1;
- int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
- for (i=0; i<fmt->n; i++)
- {
- if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
- if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
- int jalt = bcf_gt_allele(ptr[i]);
- if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
- if ( fallback_alt <= 0 ) fallback_alt = jalt;
-
- int l = strlen(rec->d.allele[jalt]);
- for (j=0; j<l; j++)
- if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
- if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
-
- if ( l > mlen )
- {
- hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
- for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
- mlen = l;
- }
- if ( jalt>0 && l>alen )
- {
- alen = l;
- ialt = jalt;
- }
- for (j=0; j<l; j++)
- args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
- }
- if ( alen > 0 )
- for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
- else if ( fallback_alt >= 0 )
- ialt = fallback_alt;
- else if ( is_missing && !args->missing_allele ) return;
+ iupac_init(args,rec);
+ int is_set = iupac_add_gt(args, rec, ptr, fmt->n);
+ if ( !is_set && !args->missing_allele ) return;
+ if ( is_set ) ialt = iupac_set_allele(args, rec);
}
else
{
fprintf(stderr, "\n");
fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
fprintf(stderr, " file. By default, the program will apply all ALT variants. Using the\n");
- fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
+ fprintf(stderr, " --samples (and, optionally, --haplotype) option will apply genotype\n");
fprintf(stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
fprintf(stderr, "Options:\n");
- fprintf(stderr, " -c, --chain FILE write a chain file for liftover\n");
- fprintf(stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n");
- fprintf(stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n");
- fprintf(stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n");
+ fprintf(stderr, " -c, --chain FILE Write a chain file for liftover\n");
+ fprintf(stderr, " -a, --absent CHAR Replace positions absent from VCF with CHAR\n");
+ fprintf(stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
+ fprintf(stderr, " -H, --haplotype WHICH Choose which allele to use from the FORMAT/GT field, note\n");
fprintf(stderr, " the codes are case-insensitive:\n");
fprintf(stderr, " 1: first allele from GT, regardless of phasing\n");
fprintf(stderr, " 2: second allele from GT, regardless of phasing\n");
fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
fprintf(stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
- fprintf(stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
- fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
- fprintf(stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n");
- fprintf(stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
- fprintf(stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
- fprintf(stderr, " -m, --mask FILE replace regions according to the next --mask-with option. The default is --mask-with N\n");
- fprintf(stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
- fprintf(stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n");
- fprintf(stderr, " -o, --output FILE write output to a file [standard output]\n");
- fprintf(stderr, " -p, --prefix STRING prefix to add to output sequence names\n");
- fprintf(stderr, " -s, --sample NAME apply variants of the given sample\n");
+ fprintf(stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(stderr, " -I, --iupac-codes Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n");
+ fprintf(stderr, " --mark-del CHAR Instead of removing sequence, insert CHAR for deletions\n");
+ fprintf(stderr, " --mark-ins uc|lc Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(stderr, " --mark-snv uc|lc Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(stderr, " -m, --mask FILE Replace regions according to the next --mask-with option. The default is --mask-with N\n");
+ fprintf(stderr, " --mask-with CHAR|uc|lc Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
+ fprintf(stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n");
+ fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(stderr, " -p, --prefix STRING Prefix to add to output sequence names\n");
+ fprintf(stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n");
+ fprintf(stderr, " -S, --samples-file FILE File of samples to include\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
fprintf(stderr, " # in the form \">chr:from-to\".\n");
{"exclude",required_argument,NULL,'e'},
{"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
{"iupac-codes",0,0,'I'},
{"haplotype",1,0,'H'},
{"output",1,0,'o'},
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
{
switch (c)
{
break;
case 'p': args->chr_prefix = optarg; break;
case 's': args->sample = optarg; break;
+ case 'S': args->sample_fname = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'I': args->output_iupac = 1; break;
case 'e':
/* The MIT License
- Copyright (c) 2014-2022 Genome Research Ltd.
+ Copyright (c) 2014-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include "bcftools.h"
#include "rbuf.h"
#include "filter.h"
+#include "smpl_ilist.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
FILE *fp_out;
FILE *fp_chain;
char **argv;
- int argc, output_iupac, haplotype, allele, isample, napplied;
- uint8_t *iupac_bitmask;
- int miupac_bitmask;
- char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
+ int argc, output_iupac, iupac_GTs, haplotype, allele, isample, napplied;
+ uint8_t *iupac_bitmask, *iupac_als;
+ int miupac_bitmask, miupac_als;
+ char *fname, *ref_fname, *sample, *sample_fname, *output_fname, *mask_fname, *chain_fname, missing_allele, absent_allele;
char mark_del, mark_ins, mark_snv;
+ smpl_ilist_t *smpl;
}
args_t;
if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to read from %s: %s\n", !strcmp("-",args->fname)?"standard input":args->fname, bcf_sr_strerror(args->files->errnum));
args->hdr = args->files->readers[0].header;
args->isample = -1;
- if ( args->sample )
+ if ( !args->sample )
+ args->smpl = smpl_ilist_init(args->hdr,NULL,0,SMPL_NONE|SMPL_VERBOSE);
+ else if ( args->sample && strcmp("-",args->sample) )
{
- args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
- if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
+ args->smpl = smpl_ilist_init(args->hdr,args->sample,0,SMPL_NONE|SMPL_VERBOSE);
+ if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
}
- if ( (args->haplotype || args->allele) && args->isample<0 )
+ else if ( args->sample_fname )
{
- if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
- args->isample = 0;
+ args->smpl = smpl_ilist_init(args->hdr,args->sample_fname,1,SMPL_NONE|SMPL_VERBOSE);
+ if ( args->smpl && !args->smpl->n ) error("No matching sample found\n");
+ }
+ if ( args->smpl )
+ {
+ if ( args->haplotype || args->allele )
+ {
+ if ( args->smpl->n > 1 ) error("Too many samples, only one can be used with -H\n");
+ args->isample = args->smpl->idx[0];
+ }
+ else
+ args->iupac_GTs = 1;
}
int i;
for (i=0; i<args->nmask; i++)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = bcftools_stdout;
- if ( args->isample<0 ) fprintf(bcftools_stderr,"Note: the --sample option not given, applying all records regardless of the genotype\n");
+ if ( args->isample<0 && !args->iupac_GTs ) fprintf(bcftools_stderr,"Note: the --samples option not given, applying all records regardless of the genotype\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
args->rid = -1;
}
static void destroy_data(args_t *args)
{
+ free(args->iupac_als);
free(args->iupac_bitmask);
if (args->filter) filter_destroy(args->filter);
+ if ( args->smpl ) smpl_ilist_destroy(args->smpl);
bcf_sr_destroy(args->files);
int i;
for (i=0; i<args->vcf_rbuf.m; i++)
if ( tolower(ref[i])!=tolower(alt[i]) ) alt[i] = toupper(alt[i]);
}
}
+// Prepare args->iupac_als for a new record: make room for one flag per allele
+// of rec and clear every flag in the buffer.
+static void iupac_init(args_t *args, bcf1_t *rec)
+{
+    int i;
+    // Unlike hts_expand(), hts_resize() signals an allocation failure through its
+    // return value. Stop here rather than continue with a buffer that is too small
+    // for rec->n_allele flags.
+    if ( hts_resize(uint8_t, rec->n_allele, &args->miupac_als, &args->iupac_als, 0) < 0 )
+        error("Failed to allocate memory at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+    for (i=0; i<args->miupac_als; i++) args->iupac_als[i] = 0;
+}
+// Flag in args->iupac_als every allele referenced by one genotype.
+//   gt, ngt .. the 8-bit encoded genotype values and how many of them to read
+// Returns 1 if at least one non-missing allele was flagged, 0 otherwise.
+static int iupac_add_gt(args_t *args, bcf1_t *rec, uint8_t *gt, int ngt)
+{
+    int k = 0, found = 0;
+    while ( k < ngt )
+    {
+        uint8_t val = gt[k++];
+        if ( bcf_gt_is_missing(val) ) continue;
+        if ( val==(uint8_t)bcf_int8_vector_end ) break;     // no more values in this genotype
+        int allele = bcf_gt_allele(val);
+        if ( allele >= rec->n_allele )
+            error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+        args->iupac_als[allele] = 1;
+        found = 1;
+    }
+    return found;
+}
+// Merge the alleles flagged in args->iupac_als into per-position IUPAC codes.
+// Alleles containing a character that iupac2bitmask() rejects are left out of the merge.
+// The longest merged ALT allele is overwritten in place with the combined codes and
+// its index is returned. When no ALT was merged, the return value is the first
+// flagged ALT, or 0 when only REF is flagged, or -1 when nothing is flagged.
+static int iupac_set_allele(args_t *args, bcf1_t *rec)
+{
+    int i,j, max_len = 0, alt_len = 0, ialt = -1, fallback_alt = -1;
+    for (i=0; i<rec->n_allele; i++)
+    {
+        if ( !args->iupac_als[i] ) continue;
+        if ( fallback_alt <=0 ) fallback_alt = i;   // a flagged REF (0) is replaced by the first flagged ALT
+        int l = strlen(rec->d.allele[i]);
+        for (j=0; j<l; j++)
+            if ( iupac2bitmask(rec->d.allele[i][j]) < 0 ) break;
+        if ( j<l ) continue;    // symbolic allele, breakpoint or invalid character in the allele
+        if ( l>max_len )
+        {
+            // Unlike hts_expand(), hts_resize() signals an allocation failure through
+            // its return value; do not write into a buffer that could not be grown.
+            if ( hts_resize(uint8_t, l, &args->miupac_bitmask, &args->iupac_bitmask, HTS_RESIZE_CLEAR) < 0 )
+                error("Failed to allocate memory at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+            // the buffer is reused across records, reset the positions used for the first time here
+            for (j=max_len; j<l; j++) args->iupac_bitmask[j] = 0;
+            max_len = l;
+        }
+        if ( i>0 && l>alt_len )
+        {
+            alt_len = l;
+            ialt = i;
+        }
+        for (j=0; j<l; j++)
+            args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[i][j]);
+    }
+    if ( alt_len > 0 )
+    {
+        for (j=0; j<alt_len; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
+        return ialt;
+    }
+    if ( fallback_alt >= 0 ) return fallback_alt;
+    return ialt;
+}
static void apply_variant(args_t *args, bcf1_t *rec)
{
static int warned_haplotype = 0;
}
int ialt = 1; // the alternate allele
- if ( args->isample >= 0 )
+ if ( args->iupac_GTs )
+ {
+ bcf_unpack(rec, BCF_UN_FMT);
+ bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
+ if ( !fmt ) return;
+ if ( fmt->type!=BCF_BT_INT8 )
+ error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%"PRId64"?\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
+ ialt = -1;
+ int is_set = 0;
+ iupac_init(args,rec);
+ for (i=0; i<args->smpl->n; i++)
+ {
+ uint8_t *ptr = fmt->p + fmt->size*args->smpl->idx[i];
+ is_set += iupac_add_gt(args, rec, ptr, fmt->n);
+ }
+ if ( !is_set && !args->missing_allele ) return;
+ if ( is_set ) ialt = iupac_set_allele(args, rec);
+ }
+ else if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
else if ( action==use_iupac )
{
ialt = -1;
- int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
- for (i=0; i<fmt->n; i++)
- {
- if ( bcf_gt_is_missing(ptr[i]) ) { is_missing = 1; continue; }
- if ( ptr[i]==(uint8_t)bcf_int8_vector_end ) break;
- int jalt = bcf_gt_allele(ptr[i]);
- if ( jalt >= rec->n_allele ) error("Invalid VCF, too few ALT alleles at %s:%"PRId64"\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
- if ( fallback_alt <= 0 ) fallback_alt = jalt;
-
- int l = strlen(rec->d.allele[jalt]);
- for (j=0; j<l; j++)
- if ( iupac2bitmask(rec->d.allele[jalt][j]) < 0 ) break;
- if ( j<l ) continue; // symbolic allele, breakpoint or invalid character in the allele
-
- if ( l > mlen )
- {
- hts_expand(uint8_t,l,args->miupac_bitmask,args->iupac_bitmask);
- for (j=mlen; j<l; j++) args->iupac_bitmask[j] = 0;
- mlen = l;
- }
- if ( jalt>0 && l>alen )
- {
- alen = l;
- ialt = jalt;
- }
- for (j=0; j<l; j++)
- args->iupac_bitmask[j] |= iupac2bitmask(rec->d.allele[jalt][j]);
- }
- if ( alen > 0 )
- for (j=0; j<alen; j++) rec->d.allele[ialt][j] = bitmask2iupac(args->iupac_bitmask[j]);
- else if ( fallback_alt >= 0 )
- ialt = fallback_alt;
- else if ( is_missing && !args->missing_allele ) return;
+ iupac_init(args,rec);
+ int is_set = iupac_add_gt(args, rec, ptr, fmt->n);
+ if ( !is_set && !args->missing_allele ) return;
+ if ( is_set ) ialt = iupac_set_allele(args, rec);
}
else
{
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
fprintf(bcftools_stderr, " file. By default, the program will apply all ALT variants. Using the\n");
- fprintf(bcftools_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
+ fprintf(bcftools_stderr, " --samples (and, optionally, --haplotype) option will apply genotype\n");
fprintf(bcftools_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
fprintf(bcftools_stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(bcftools_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf.gz>\n");
fprintf(bcftools_stderr, "Options:\n");
- fprintf(bcftools_stderr, " -c, --chain FILE write a chain file for liftover\n");
- fprintf(bcftools_stderr, " -a, --absent CHAR replace positions absent from VCF with CHAR\n");
- fprintf(bcftools_stderr, " -e, --exclude EXPR exclude sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -f, --fasta-ref FILE reference sequence in fasta format\n");
- fprintf(bcftools_stderr, " -H, --haplotype WHICH choose which allele to use from the FORMAT/GT field, note\n");
+ fprintf(bcftools_stderr, " -c, --chain FILE Write a chain file for liftover\n");
+ fprintf(bcftools_stderr, " -a, --absent CHAR Replace positions absent from VCF with CHAR\n");
+ fprintf(bcftools_stderr, " -e, --exclude EXPR Exclude sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -f, --fasta-ref FILE Reference sequence in fasta format\n");
+ fprintf(bcftools_stderr, " -H, --haplotype WHICH Choose which allele to use from the FORMAT/GT field, note\n");
fprintf(bcftools_stderr, " the codes are case-insensitive:\n");
fprintf(bcftools_stderr, " 1: first allele from GT, regardless of phasing\n");
fprintf(bcftools_stderr, " 2: second allele from GT, regardless of phasing\n");
fprintf(bcftools_stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
fprintf(bcftools_stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
fprintf(bcftools_stderr, " 1pIu,2pIu: first/second allele for phased and IUPAC code for unphased GTs\n");
- fprintf(bcftools_stderr, " -i, --include EXPR select sites for which the expression is true (see man page for details)\n");
- fprintf(bcftools_stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
- fprintf(bcftools_stderr, " --mark-del CHAR instead of removing sequence, insert CHAR for deletions\n");
- fprintf(bcftools_stderr, " --mark-ins uc|lc highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
- fprintf(bcftools_stderr, " --mark-snv uc|lc highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
- fprintf(bcftools_stderr, " -m, --mask FILE replace regions according to the next --mask-with option. The default is --mask-with N\n");
- fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
- fprintf(bcftools_stderr, " -M, --missing CHAR output CHAR instead of skipping a missing genotype \"./.\"\n");
- fprintf(bcftools_stderr, " -o, --output FILE write output to a file [standard output]\n");
- fprintf(bcftools_stderr, " -p, --prefix STRING prefix to add to output sequence names\n");
- fprintf(bcftools_stderr, " -s, --sample NAME apply variants of the given sample\n");
+ fprintf(bcftools_stderr, " -i, --include EXPR Select sites for which the expression is true (see man page for details)\n");
+ fprintf(bcftools_stderr, " -I, --iupac-codes Output IUPAC codes based on FORMAT/GT, use -s/-S to subset samples\n");
+ fprintf(bcftools_stderr, " --mark-del CHAR Instead of removing sequence, insert CHAR for deletions\n");
+ fprintf(bcftools_stderr, " --mark-ins uc|lc Highlight insertions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(bcftools_stderr, " --mark-snv uc|lc Highlight substitutions in uppercase (uc) or lowercase (lc), leaving the rest as is\n");
+ fprintf(bcftools_stderr, " -m, --mask FILE Replace regions according to the next --mask-with option. The default is --mask-with N\n");
+ fprintf(bcftools_stderr, " --mask-with CHAR|uc|lc Replace with CHAR (skips overlapping variants); change to uppercase (uc) or lowercase (lc)\n");
+ fprintf(bcftools_stderr, " -M, --missing CHAR Output CHAR instead of skipping a missing genotype \"./.\"\n");
+ fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
+ fprintf(bcftools_stderr, " -p, --prefix STRING Prefix to add to output sequence names\n");
+ fprintf(bcftools_stderr, " -s, --samples LIST Comma-separated list of samples to include, \"-\" to ignore samples and use REF,ALT\n");
+ fprintf(bcftools_stderr, " -S, --samples-file FILE File of samples to include\n");
fprintf(bcftools_stderr, "Examples:\n");
fprintf(bcftools_stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n");
{"exclude",required_argument,NULL,'e'},
{"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
{"iupac-codes",0,0,'I'},
{"haplotype",1,0,'H'},
{"output",1,0,'o'},
{0,0,0,0}
};
int c;
- while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?s:S:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
{
switch (c)
{
break;
case 'p': args->chr_prefix = optarg; break;
case 's': args->sample = optarg; break;
+ case 'S': args->sample_fname = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'I': args->output_iupac = 1; break;
case 'e':
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
+ const char *alt = NULL; // ALT string handed to variantkey(); stays NULL when the record has no second allele
+ size_t sizealt = 0; // length of alt; stays 0 when the record has no second allele
+ if ( line->n_allele>1 ) // read allele[1] only when it exists
+ {
+ alt = line->d.allele[1];
+ sizealt = strlen(line->d.allele[1]);
+ }
uint64_t vk = variantkey(
convert->header->id[BCF_DT_CTG][line->rid].key,
strlen(convert->header->id[BCF_DT_CTG][line->rid].key),
line->pos,
line->d.allele[0],
strlen(line->d.allele[0]),
- line->d.allele[1],
- strlen(line->d.allele[1]));
+ alt, // replaces the unconditional use of allele[1]
+ sizealt);
ksprintf(str, "%016" PRIx64 "", vk);
}
if ( i!=convert->nfmt )
return str->l - l_ori;
- kputs("# ", str);
+ kputc('#', str);
for (i=0; i<convert->nfmt; i++)
{
// Genotype fields
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
static void process_variantkey_hex(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
+ const char *alt = NULL; // ALT string handed to variantkey(); stays NULL when the record has no second allele
+ size_t sizealt = 0; // length of alt; stays 0 when the record has no second allele
+ if ( line->n_allele>1 ) // read allele[1] only when it exists
+ {
+ alt = line->d.allele[1];
+ sizealt = strlen(line->d.allele[1]);
+ }
uint64_t vk = variantkey(
convert->header->id[BCF_DT_CTG][line->rid].key,
strlen(convert->header->id[BCF_DT_CTG][line->rid].key),
line->pos,
line->d.allele[0],
strlen(line->d.allele[0]),
- line->d.allele[1],
- strlen(line->d.allele[1]));
+ alt, // replaces the unconditional use of allele[1]
+ sizealt);
ksprintf(str, "%016" PRIx64 "", vk);
}
if ( i!=convert->nfmt )
return str->l - l_ori;
- kputs("# ", str);
+ kputc('#', str);
for (i=0; i<convert->nfmt; i++)
{
// Genotype fields
/* The MIT License
- Copyright (c) 2016-2021 Genome Research Ltd.
+ Copyright (c) 2016-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
C .. corresponding CDS, exon, and UTR lines:
- C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+ C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
-
-
+
+
The supported consequence types, sorted by impact:
splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
(based on biotype) which maps from transcript_id to a transcript. At
the same time also build the hash "gid2gene" which maps from gene_id to
gf_gene_t pointer.
-
+
2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
Use only features from "ftr" which are present in "id2tr".
3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
-
+
Data structures.
idx_cds, idx_utr, idx_exon, idx_tscript:
as described above, regidx structures for fast lookup of exons/transcripts
overlapping a region, the payload is a pointer to tscript.cds
*/
-
+
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define FLT_EXCLUDE 2
// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR 2
-#define N_SPLICE_REGION_EXON 3
-#define N_SPLICE_REGION_INTRON 8
+#define N_SPLICE_DONOR 2
+#define N_SPLICE_REGION_EXON 3
+#define N_SPLICE_REGION_INTRON 8
#define N_REF_PAD 10 // number of bases to avoid boundary effects
// Node types in the haplotype tree
#define HAP_CDS 0
-#define HAP_ROOT 1
+#define HAP_ROOT 1
#define HAP_SSS 2 // start/stop/splice
#define CSQ_PRINTED_UPSTREAM (1<<0)
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
// see kput_vcsq()
-const char *csq_strings[] =
+const char *csq_strings[] =
{
- NULL,
- "synonymous",
- "missense",
- "stop_lost",
- "stop_gained",
- "inframe_deletion",
- "inframe_insertion",
- "frameshift",
- "splice_acceptor",
- "splice_donor",
- "start_lost",
- "splice_region",
- "stop_retained",
- "5_prime_utr",
- "3_prime_utr",
- "non_coding",
- "intron",
+ NULL,
+ "synonymous",
+ "missense",
+ "stop_lost",
+ "stop_gained",
+ "inframe_deletion",
+ "inframe_insertion",
+ "frameshift",
+ "splice_acceptor",
+ "splice_donor",
+ "start_lost",
+ "splice_region",
+ "stop_retained",
+ "5_prime_utr",
+ "3_prime_utr",
+ "non_coding",
+ "intron",
"intergenic",
"inframe_altering",
NULL,
// GFF line types
+#define GFF_UNKN_LINE 0
#define GFF_TSCRIPT_LINE 1
#define GFF_GENE_LINE 2
-/*
+/*
Genomic features, for fast lookup by position to overlapping features
*/
#define GF_coding_bit 6
/*
Helper structures, only for initialization
-
+
ftr_t
- temporary list of all exons, CDS, UTRs
+ temporary list of all exons, CDS, UTRs
*/
KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
int brief_predictions;
-
+
int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
hap_t *hap; // transcript haplotype recursion
#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
-static const char *gf_strings_noncoding[] =
-{
+static const char *gf_strings_noncoding[] =
+{
"MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
"antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
- "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
- "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
- "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
"transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
"translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
"LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
khash_str2int_destroy_free(tbl->str2id);
free(tbl->str);
}
-static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
+// returns 0 on success, -1 on failure
+static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
{
ss = strstr(ss,needle); // e.g. "ID=transcript:"
- if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ if ( !ss ) return -1;
ss += strlen(needle);
char *se = ss;
khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;
-
- return id;
+ *id_ptr = id;
+ return 0;
}
static inline int gff_parse_type(char *line)
{
line += 8;
switch (*line)
{
- case 'p':
+ case 'p':
if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
case 't':
if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
int biotype = gff_parse_biotype(ss);
if ( biotype <= 0 )
{
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript: %s\n",line);
+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored transcript, unknown biotype: %s\n",line);
return;
}
// create a mapping from transcript_id to gene_id
- uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
- uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
+ uint32_t trid, gene_id;
+ if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) )
+ {
+ if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) )
+ error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
+ if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) )
+ {
+ if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) )
+ error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
tr->id = trid;
int biotype = gff_parse_biotype(ss);
if ( biotype <= 0 )
{
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene: %s\n",line);
+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(stderr,"ignored gene, unknown biotype: %s\n",line);
return;
}
aux_t *aux = &args->init;
// substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
+ uint32_t gene_id;
+ if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) )
+ {
+ if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) )
+ error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
+
gf_gene_t *gene = gene_init(aux, gene_id);
assert( !gene->name ); // the gene_id should be unique
int gff_parse(args_t *args, char *line, ftr_t *ftr)
{
// - skip empty lines and commented lines
- // - columns
+ // - columns
// 1. chr
// 2. <skip>
// 3. CDS, transcript, gene, ...
else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
else
{
+ // Classify the line by its feature-type column. The compared length covers the whole
+ // literal including the trailing tab, so only the exact words "gene" and "transcript"
+ // match (not e.g. "gene_segment" or "transposable_element").
+ int type = GFF_UNKN_LINE;
+ if ( !strncmp("gene\t",ss,5) ) type = GFF_GENE_LINE;
+ else if ( !strncmp("transcript\t",ss,11) ) type = GFF_TSCRIPT_LINE;
ss = gff_skip(line, ss);
ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
ss = gff_skip(line, ss);
- int type = gff_parse_type(ss);
- if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
+ if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene:
+ if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
{
// we ignore these, debug print to see new types:
ss = strstr(ss,"ID=");
ss += 2;
// substring search for "Parent=transcript:ENST00000437963"
- ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
+ if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
+ {
+ if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
+ error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
+
ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
return 0;
}
tscript_t *tr = tscript_init(aux, ftr->trid);
if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
+
gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
cds->tr = tr;
cds->beg = ftr->beg;
cds->len = ftr->end - ftr->beg + 1;
cds->icds = 0; // to keep valgrind on mac happy
cds->phase = ftr->phase;
-
+
hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
tr->cds[tr->ncds++] = cds;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
- len += tr->cds[i]->len;
+ len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
for (i=0; i<tr->ncds; i++)
{
tr->cds[i]->icds = i;
- len += tr->cds[i]->len;
+ len += tr->cds[i]->len;
if ( !i ) continue;
gf_cds_t *a = tr->cds[i-1];
gf_cds_t *b = tr->cds[i];
- if ( a->beg + a->len - 1 >= b->beg )
+ if ( a->beg + a->len - 1 >= b->beg )
{
if ( args->force )
{
}
else
error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
- " Use the --force option to override (at your own risk).\n",
+ " Use the --force option to override (at your own risk).\n",
args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
}
}
continue;
}
- // populate regidx by category:
+ // populate regidx by category:
// ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
// gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
if ( ftr->type==GF_CDS ) register_cds(args, ftr);
if ( args->verbosity > 0 )
{
- fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+ fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
regidx_nregs(args->idx_tscript),
regidx_nregs(args->idx_exon),
regidx_nregs(args->idx_cds),
regidx_nregs(args->idx_utr));
}
+ if ( !regidx_nregs(args->idx_tscript) )
+ fprintf(stderr,
+ "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
+ " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
+ " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
free(aux->ftr);
khash_str2int_destroy_free(aux->seq2int);
if ( args->sample_list && !strcmp("-",args->sample_list) )
{
// ignore all samples
- if ( args->output_type==FT_TAB_TEXT )
+ if ( args->output_type==FT_TAB_TEXT )
{
// significant speedup for plain VCFs
if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0)
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq, see http://samtools.github.io/bcftools/howtos/csq-calling.html for details. Format: Consequence|gene|transcript|biotype|strand|amino_acid_change|dna_change\">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
- if ( args->hdr_nsmpl )
+ if ( args->hdr_nsmpl )
bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
}
*/
#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely
#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region
-#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
+#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
typedef struct
{
bcf1_t *rec;
} vcf;
uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev)
- check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
+ check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
check_donor:1, // as with check_acceptor
check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon
- check_region_end:1, //
+ check_region_end:1, //
check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr
set_refalt:1; // set kref,kalt, if set, check also for synonymous events
uint32_t csq;
int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele
- uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
+ uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
ref_end; // a more conservative csq (the first and last base in kref.s)
kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP
}
#define XDBG 0
#if XDBG
fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
-#endif
+#endif
splice->kref.l = 0;
splice->kalt.l = 0;
gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
tscript_t *tr = utr->tr;
if ( tr->id != trid ) continue;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | type;
fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
#endif
if ( !type ) return;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = type;
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
- if ( ret!=0 )
+ if ( ret!=0 )
{
regitr_destroy(itr);
return SPLICE_OUTSIDE; // overlaps utr
while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced
}
- else
+ else
{
// STRAND_FWD
int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion
}
}
}
- if ( splice->ref_end >= ex_beg )
+ if ( splice->ref_end >= ex_beg )
{
splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
splice->ref_beg = ex_beg - 1;
}
}
}
- if ( splice->ref_beg < ex_end )
+ if ( splice->ref_beg < ex_end )
{
splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
splice->ref_end = ex_end;
splice->vcf.rlen -= splice->tbeg + splice->tend;
splice->vcf.alen -= splice->tbeg + splice->tend;
}
- splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
- splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
{
splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
}
}
}
- if ( splice->ref_end >= ex_beg )
+ if ( splice->ref_end >= ex_beg )
{
splice->tbeg = splice->ref_beg - splice->vcf.pos;
splice->ref_beg = ex_beg;
}
}
}
- if ( splice->ref_beg <= ex_end )
+ if ( splice->ref_beg <= ex_end )
{
splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
splice->ref_end = ex_end;
if ( splice->set_refalt )
{
splice->vcf.rlen -= splice->tbeg + splice->tend;
- splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
- splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
}
csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
assert( parent->type!=HAP_SSS );
- if ( parent->type==HAP_CDS )
+ if ( parent->type==HAP_CDS )
{
i = parent->icds;
if ( i!=cds->icds )
/*
ref: spliced reference and its length (ref.l)
- seq: part of the spliced query transcript on the reference strand to translate, its
+ seq: part of the spliced query transcript on the reference strand to translate, its
length (seq.l) and the total length of the complete transcript (seq.m)
sbeg: seq offset within the spliced query transcript
rbeg: seq offset within ref, 0-based
else // STRAND_REV
{
// right padding - number of bases to take from ref
- npad = (seq.m - (sbeg + seq.l)) % 3;
+ npad = (seq.m - (sbeg + seq.l)) % 3;
#if DBG>1
fprintf(stderr," npad: %d\n",npad);
#endif
}
if ( seq.s-codon==2 )
{
- tmp[2] = seq.s[0];
+ tmp[2] = seq.s[0];
i = 1;
}
else if ( seq.s-codon==1 )
{
- tmp[1] = seq.s[0];
+ tmp[1] = seq.s[0];
tmp[2] = seq.s[1];
i = 0;
}
void tscript_splice_ref(tscript_t *tr)
{
int i, len = 0;
- for (i=0; i<tr->ncds; i++)
+ for (i=0; i<tr->ncds; i++)
len += tr->cds[i]->len;
tr->nsref = len + 2*N_REF_PAD;
vrec_t *vrec = vbuf->vrec[i];
// if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
- if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
+ if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
csq->type.type &= ~CSQ_SPLICE_REGION;
if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue;
if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop
- if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
+ if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
{
// This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
// can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
// consequences:
// stop_lost|AL627309.1|ENST00000423372|protein_coding|-
// stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
- if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
+ if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
{
if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
{
vrec->vcsq[i].type |= csq->type.type;
// remove stop_lost&synonymous if stop_retained set
- if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
+ if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
}
if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
}
- vrec->vcsq[i].type |= csq->type.type;
+ vrec->vcsq[i].type |= csq->type.type;
goto exit_duplicate;
}
}
{
if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
- if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
+ if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
{
vrec->vcsq[i].type |= csq->type.type;
goto exit_duplicate;
csq->type.biotype = tr->type;
// only now we see the translated sequence and can determine if the stop/start changes are real
- int rm_csq = 0;
+ int rm_csq = 0;
csq->type.type = 0;
for (i=ibeg; i<=iend; i++)
csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
}
if ( csq->type.type & CSQ_STOP_LOST )
{
- if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
{
rm_csq |= CSQ_STOP_LOST;
csq->type.type |= CSQ_STOP_RETAINED;
}
else
{
- for (i=0; i<hap->tref.l; i++)
- if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
- if ( i==hap->tref.l )
+ int aa_change = 0;
+ for (i=0; i<hap->tref.l; i++)
+ {
+ if ( hap->tref.s[i] == hap->tseq.s[i] ) continue;
+ aa_change = 1;
+ if ( hap->tref.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_LOST;
+ else if ( hap->tseq.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_GAINED;
+ else
+ csq->type.type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( !aa_change )
csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
- else if ( hap->tref.s[i] == '*' )
- csq->type.type |= CSQ_STOP_LOST;
- else if ( hap->tseq.s[i] == '*' )
- csq->type.type |= CSQ_STOP_GAINED;
- else
- csq->type.type |= CSQ_MISSENSE_VARIANT;
}
}
// Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
// The spliced sequence has been built for the current haplotype and stored
// in hap->sseq. Now we break it and output as independent parts
-
+
kstring_t sseq;
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript
hap->upstream_stop = 0;
// check for duplicate records
i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
- if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
+ if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
{
// vcf record with a new pos
rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
vrec->line->pos = save_pos; // this is necessary for compound variants
continue;
}
-
+
args->str.l = 0;
kput_vcsq(args, &vrec->vcsq[0], &args->str);
for (j=1; j<vrec->nvcsq; j++)
int i = 0;
while ( ref[i] && vcf[i] )
{
- if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
+ if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
i++;
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
}
if ( csq_type & CSQ_STOP_LOST )
{
- if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
{
csq_type &= ~CSQ_STOP_LOST;
csq_type |= CSQ_STOP_RETAINED;
}
else
{
- for (j=0; j<tref->l; j++)
- if ( tref->s[j] != tseq->s[j] ) break;
- if ( j==tref->l )
+ int aa_change = 0;
+ for (j=0; j<tref->l; j++)
+ {
+ if ( tref->s[j] == tseq->s[j] ) continue;
+ aa_change = 1;
+ if ( tref->s[j] == '*' )
+ csq_type |= CSQ_STOP_LOST;
+ else if ( tseq->s[j] == '*' )
+ csq_type |= CSQ_STOP_GAINED;
+ else
+ csq_type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( !aa_change )
csq_type |= CSQ_SYNONYMOUS_VARIANT;
- else if ( tref->s[j] == '*' )
- csq_type |= CSQ_STOP_LOST;
- else if ( tseq->s[j] == '*' )
- csq_type |= CSQ_STOP_GAINED;
- else
- csq_type |= CSQ_MISSENSE_VARIANT;
}
if ( csq_type & CSQ_COMPOUND )
{
tr->root->ncsq_list++;
hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
- rm_csq->type.vstr = str;
+ rm_csq->type.vstr = str;
}
if ( csq_type & ~CSQ_COMPOUND )
{
fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
overlaps_warned = 1;
}
- if ( args->out )
+ if ( args->out )
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
}
else ret = 1; // prevent reporting as intron in test_tscript
}
if ( child->type==HAP_SSS )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
// apply the VCF variants and extend the haplotype tree
int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
ngts /= bcf_hdr_nsamples(args->hdr);
- if ( ngts!=1 && ngts!=2 )
+ if ( ngts!=1 && ngts!=2 )
{
if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) )
{
fprintf(stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
multiploid_warned = 1;
}
- if ( args->out )
+ if ( args->out )
fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
continue;
}
}
if ( child->type==HAP_SSS )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
splice.csq = 0;
int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
splice.csq = 0;
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
{
while ( regitr_overlap(args->itr) )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
tscript_t *tr = cds->tr;
{
while ( regitr_overlap(args->itr) )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
tscript_t *tr = utr->tr;
while ( regitr_overlap(args->itr) )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
splice.vcf.alt = rec->d.allele[1];
return;
}
- if ( args->rid != rec->rid )
+ if ( args->rid != rec->rid )
{
hap_flush(args, REGIDX_MAX);
vbuf_flush(args, REGIDX_MAX);
static const char *usage(void)
{
- return
+ return
"\n"
"About: Haplotype-aware consequence caller.\n"
"Usage: bcftools csq [OPTIONS] in.vcf\n"
" -g, --gff-annot FILE GFF3 annotation file\n"
"\n"
"CSQ options:\n"
- " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
" -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
" -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
{"no-version",no_argument,NULL,3},
{0,0,0,0}
};
- int c, targets_is_file = 0, regions_is_file = 0;
+ int c, targets_is_file = 0, regions_is_file = 0;
int regions_overlap = 1;
int targets_overlap = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
{
- switch (c)
+ switch (c)
{
case 1 : args->force = 1; break;
case 2 :
args->brief_predictions = 1;
fprintf(stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
break;
- case 'B':
+ case 'B':
args->brief_predictions = strtol(optarg,&tmp,10);
if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
- case 'v':
+ case 'v':
args->verbosity = atoi(optarg);
if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
break;
case 'p':
- switch (optarg[0])
+ switch (optarg[0])
{
case 'a': args->phase = PHASE_AS_IS; break;
case 'm': args->phase = PHASE_MERGE; break;
break;
case 'f': args->fa_fname = optarg; break;
case 'g': args->gff_fname = optarg; break;
- case 'n':
+ case 'n':
args->ncsq2_max = 2 * atoi(optarg);
if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
break;
/* The MIT License
- Copyright (c) 2016-2021 Genome Research Ltd.
+ Copyright (c) 2016-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
C .. corresponding CDS, exon, and UTR lines:
- C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+ C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
-
-
+
+
The supported consequence types, sorted by impact:
splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
(based on biotype) which maps from transcript_id to a transcript. At
the same time also build the hash "gid2gene" which maps from gene_id to
gf_gene_t pointer.
-
+
2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
Use only features from "ftr" which are present in "id2tr".
3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
-
+
Data structures.
idx_cds, idx_utr, idx_exon, idx_tscript:
as described above, regidx structures for fast lookup of exons/transcripts
overlapping a region, the payload is a pointer to tscript.cds
*/
-
+
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define FLT_EXCLUDE 2
// Definition of splice_region, splice_acceptor and splice_donor
-#define N_SPLICE_DONOR 2
-#define N_SPLICE_REGION_EXON 3
-#define N_SPLICE_REGION_INTRON 8
+#define N_SPLICE_DONOR 2
+#define N_SPLICE_REGION_EXON 3
+#define N_SPLICE_REGION_INTRON 8
#define N_REF_PAD 10 // number of bases to avoid boundary effects
// Node types in the haplotype tree
#define HAP_CDS 0
-#define HAP_ROOT 1
+#define HAP_ROOT 1
#define HAP_SSS 2 // start/stop/splice
#define CSQ_PRINTED_UPSTREAM (1<<0)
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
// see kput_vcsq()
-const char *csq_strings[] =
+const char *csq_strings[] =
{
- NULL,
- "synonymous",
- "missense",
- "stop_lost",
- "stop_gained",
- "inframe_deletion",
- "inframe_insertion",
- "frameshift",
- "splice_acceptor",
- "splice_donor",
- "start_lost",
- "splice_region",
- "stop_retained",
- "5_prime_utr",
- "3_prime_utr",
- "non_coding",
- "intron",
+ NULL,
+ "synonymous",
+ "missense",
+ "stop_lost",
+ "stop_gained",
+ "inframe_deletion",
+ "inframe_insertion",
+ "frameshift",
+ "splice_acceptor",
+ "splice_donor",
+ "start_lost",
+ "splice_region",
+ "stop_retained",
+ "5_prime_utr",
+ "3_prime_utr",
+ "non_coding",
+ "intron",
"intergenic",
"inframe_altering",
NULL,
// GFF line types
+#define GFF_UNKN_LINE 0
#define GFF_TSCRIPT_LINE 1
#define GFF_GENE_LINE 2
-/*
+/*
Genomic features, for fast lookup by position to overlapping features
*/
#define GF_coding_bit 6
/*
Helper structures, only for initialization
-
+
ftr_t
- temporary list of all exons, CDS, UTRs
+ temporary list of all exons, CDS, UTRs
*/
KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
int ncsq2_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ (*2 and 1 bit skipped to avoid BCF missing values)
int ncsq2_small_warned;
int brief_predictions;
-
+
int rid; // current chromosome
tr_heap_t *active_tr; // heap of active transcripts for quick flushing
hap_t *hap; // transcript haplotype recursion
#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
-static const char *gf_strings_noncoding[] =
-{
+static const char *gf_strings_noncoding[] =
+{
"MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
"antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
- "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
- "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
- "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
"transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
"translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
"LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
khash_str2int_destroy_free(tbl->str2id);
free(tbl->str);
}
-static inline uint32_t gff_id_parse(id_tbl_t *tbl, const char *line, const char *needle, char *ss)
+// returns 0 on success, -1 on failure
+static inline int gff_id_parse(id_tbl_t *tbl, const char *needle, char *ss, uint32_t *id_ptr)
{
ss = strstr(ss,needle); // e.g. "ID=transcript:"
- if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ if ( !ss ) return -1;
ss += strlen(needle);
char *se = ss;
khash_str2int_set(tbl->str2id, tbl->str[id], id);
}
*se = tmp;
-
- return id;
+ *id_ptr = id;
+ return 0;
}
static inline int gff_parse_type(char *line)
{
line += 8;
switch (*line)
{
- case 'p':
+ case 'p':
if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
case 't':
if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
- else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
int biotype = gff_parse_biotype(ss);
if ( biotype <= 0 )
{
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript: %s\n",line);
+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored transcript, unknown biotype: %s\n",line);
return;
}
// create a mapping from transcript_id to gene_id
- uint32_t trid = gff_id_parse(&args->tscript_ids, line, "ID=transcript:", ss);
- uint32_t gene_id = gff_id_parse(&args->init.gene_ids, line, "Parent=gene:", ss);
+ uint32_t trid, gene_id;
+ if ( gff_id_parse(&args->tscript_ids, "ID=transcript:", ss, &trid) )
+ {
+ if ( gff_id_parse(&args->tscript_ids, "ID=", ss, &trid) )
+ error("[%s:%d %s] Could not parse the line, neither \"ID=transcript:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(bcftools_stderr,"Warning: non-standard transcript ID notation in the GFF, expected \"ID=transcript:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
+ if ( gff_id_parse(&args->init.gene_ids, "Parent=gene:", ss, &gene_id) )
+ {
+ if ( gff_id_parse(&args->init.gene_ids, "Parent=", ss, &gene_id) )
+ error("[%s:%d %s] Could not parse the line, neither \"Parent=gene:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(bcftools_stderr,"Warning: non-standard transcript Parent notation in the GFF, expected \"Parent=gene:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
tr->id = trid;
int biotype = gff_parse_biotype(ss);
if ( biotype <= 0 )
{
- if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene: %s\n",line);
+ if ( !gff_ignored_biotype(args, ss) && args->verbosity > 0 ) fprintf(bcftools_stderr,"ignored gene, unknown biotype: %s\n",line);
return;
}
aux_t *aux = &args->init;
// substring search for "ID=gene:ENSG00000437963"
- uint32_t gene_id = gff_id_parse(&aux->gene_ids, line, "ID=gene:", ss);
+ uint32_t gene_id;
+ if ( gff_id_parse(&aux->gene_ids, "ID=gene:", ss, &gene_id) )
+ {
+ if ( gff_id_parse(&aux->gene_ids, "ID=", ss, &gene_id) )
+ error("[%s:%d %s] Could not parse the line, neither \"ID=gene:\" nor \"ID=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(bcftools_stderr,"Warning: non-standard gene ID notation in the GFF, expected \"ID=gene:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
+
gf_gene_t *gene = gene_init(aux, gene_id);
assert( !gene->name ); // the gene_id should be unique
int gff_parse(args_t *args, char *line, ftr_t *ftr)
{
// - skip empty lines and commented lines
- // - columns
+ // - columns
// 1. chr
// 2. <skip>
// 3. CDS, transcript, gene, ...
else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
else
{
+ int type = GFF_UNKN_LINE;
+ // Compare the complete keyword including the trailing tab (as done for "five_prime_UTR\t" above),
+ // otherwise any feature type that merely starts with "gene" or "tran" would be accepted here
+ if ( !strncmp("gene\t",ss,5) ) type = GFF_GENE_LINE;
+ else if ( !strncmp("transcript\t",ss,11) ) type = GFF_TSCRIPT_LINE;
ss = gff_skip(line, ss);
ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
ss = gff_skip(line, ss);
- int type = gff_parse_type(ss);
- if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
+ if ( type==GFF_UNKN_LINE ) type = gff_parse_type(ss); // determine type from ID=transcript: or ID=gene:
+ if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
{
// we ignore these, debug print to see new types:
ss = strstr(ss,"ID=");
ss += 2;
// substring search for "Parent=transcript:ENST00000437963"
- ftr->trid = gff_id_parse(&args->tscript_ids, line, "Parent=transcript:", ss);
+ if ( gff_id_parse(&args->tscript_ids, "Parent=transcript:", ss, &ftr->trid) )
+ {
+ if ( gff_id_parse(&args->tscript_ids, "Parent=", ss, &ftr->trid) )
+ error("[%s:%d %s] Could not parse the line, neither \"Parent=transcript:\" nor \"Parent=\" substring is present: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ static int warned = 0;
+ if ( !warned && args->verbosity > 0 )
+ {
+ fprintf(bcftools_stderr,"Warning: non-standard gene Parent notation in the GFF, expected \"Parent=transcript:XXX\", found %s\n",line);
+ warned = 1;
+ }
+ }
+
ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
return 0;
}
tscript_t *tr = tscript_init(aux, ftr->trid);
if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
-
+
gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
cds->tr = tr;
cds->beg = ftr->beg;
cds->len = ftr->end - ftr->beg + 1;
cds->icds = 0; // to keep valgrind on mac happy
cds->phase = ftr->phase;
-
+
hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
tr->cds[tr->ncds++] = cds;
}
error("Error: GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d). Use the --force option to proceed anyway (at your own risk).\n",
args->tscript_ids.str[tr->id],tr->cds[i]->beg+1,phase,len);
}
- len += tr->cds[i]->len;
+ len += tr->cds[i]->len;
}
if ( !tscript_ok ) continue; // skip this transcript
}
for (i=0; i<tr->ncds; i++)
{
tr->cds[i]->icds = i;
- len += tr->cds[i]->len;
+ len += tr->cds[i]->len;
if ( !i ) continue;
gf_cds_t *a = tr->cds[i-1];
gf_cds_t *b = tr->cds[i];
- if ( a->beg + a->len - 1 >= b->beg )
+ if ( a->beg + a->len - 1 >= b->beg )
{
if ( args->force )
{
}
else
error("Error: CDS overlap in the transcript %s: %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32", is this intended (e.g. ribosomal slippage)?\n"
- " Use the --force option to override (at your own risk).\n",
+ " Use the --force option to override (at your own risk).\n",
args->tscript_ids.str[tr->id], a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
}
}
continue;
}
- // populate regidx by category:
+ // populate regidx by category:
// ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
// gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
if ( ftr->type==GF_CDS ) register_cds(args, ftr);
if ( args->verbosity > 0 )
{
- fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+ fprintf(bcftools_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
regidx_nregs(args->idx_tscript),
regidx_nregs(args->idx_exon),
regidx_nregs(args->idx_cds),
regidx_nregs(args->idx_utr));
}
+ if ( !regidx_nregs(args->idx_tscript) )
+ fprintf(bcftools_stderr,
+ "Warning: No usable transcripts found, likely a failure to parse a non-standard GFF file. Please check if the misc/gff2gff\n"
+ " or misc/gff2gff.py script can fix the problem (both do different things). See also the man page for the description\n"
+ " of the expected format http://samtools.github.io/bcftools/bcftools-man.html#csq\n");
free(aux->ftr);
khash_str2int_destroy_free(aux->seq2int);
if ( args->sample_list && !strcmp("-",args->sample_list) )
{
// ignore all samples
- if ( args->output_type==FT_TAB_TEXT )
+ if ( args->output_type==FT_TAB_TEXT )
{
// significant speedup for plain VCFs
if (bcf_hdr_set_samples(args->hdr,NULL,0) < 0)
hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->sr->p);
if ( args->record_cmd_line ) bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq, see http://samtools.github.io/bcftools/howtos/csq-calling.html for details. Format: Consequence|gene|transcript|biotype|strand|amino_acid_change|dna_change\">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
- if ( args->hdr_nsmpl )
+ if ( args->hdr_nsmpl )
bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
}
*/
#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely
#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region
-#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
+#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
typedef struct
{
bcf1_t *rec;
} vcf;
uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev)
- check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
+ check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
check_donor:1, // as with check_acceptor
check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon
- check_region_end:1, //
+ check_region_end:1, //
check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr
set_refalt:1; // set kref,kalt, if set, check also for synonymous events
uint32_t csq;
int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele
- uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
+ uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
ref_end; // a more conservative csq (the first and last base in kref.s)
kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP
}
#define XDBG 0
#if XDBG
fprintf(bcftools_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
-#endif
+#endif
splice->kref.l = 0;
splice->kalt.l = 0;
gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
tscript_t *tr = utr->tr;
if ( tr->id != trid ) continue;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = (utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3) | type;
fprintf(bcftools_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
#endif
if ( !type ) return;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = type;
if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
{
ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id, splice->csq, splice->vcf.ial);
- if ( ret!=0 )
+ if ( ret!=0 )
{
regitr_destroy(itr);
return SPLICE_OUTSIDE; // overlaps utr
while ( ptr_vcf[i] && ptr_vcf[i]==ptr_ref[i] ) i++;
if ( ptr_vcf[i] ) return 0; // the deleted sequence cannot be replaced
}
- else
+ else
{
// STRAND_FWD
int32_t vcf_block_beg = splice->vcf.pos + ref_len - 2*ndel; // the position of the first base of the ref block that could potentially replace the deletion
}
}
}
- if ( splice->ref_end >= ex_beg )
+ if ( splice->ref_end >= ex_beg )
{
splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
splice->ref_beg = ex_beg - 1;
}
}
}
- if ( splice->ref_beg < ex_end )
+ if ( splice->ref_beg < ex_end )
{
splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
splice->ref_end = ex_end;
splice->vcf.rlen -= splice->tbeg + splice->tend;
splice->vcf.alen -= splice->tbeg + splice->tend;
}
- splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
- splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
{
splice->csq |= (splice->ref_end - splice->ref_beg)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
}
}
}
- if ( splice->ref_end >= ex_beg )
+ if ( splice->ref_end >= ex_beg )
{
splice->tbeg = splice->ref_beg - splice->vcf.pos;
splice->ref_beg = ex_beg;
}
}
}
- if ( splice->ref_beg <= ex_end )
+ if ( splice->ref_beg <= ex_end )
{
splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
splice->ref_end = ex_end;
if ( splice->set_refalt )
{
splice->vcf.rlen -= splice->tbeg + splice->tend;
- splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
- splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
}
csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
return SPLICE_INSIDE;
}
assert( parent->type!=HAP_SSS );
- if ( parent->type==HAP_CDS )
+ if ( parent->type==HAP_CDS )
{
i = parent->icds;
if ( i!=cds->icds )
/*
ref: spliced reference and its length (ref.l)
- seq: part of the spliced query transcript on the reference strand to translate, its
+ seq: part of the spliced query transcript on the reference strand to translate, its
length (seq.l) and the total length of the complete transcript (seq.m)
sbeg: seq offset within the spliced query transcript
rbeg: seq offset within ref, 0-based
else // STRAND_REV
{
// right padding - number of bases to take from ref
- npad = (seq.m - (sbeg + seq.l)) % 3;
+ npad = (seq.m - (sbeg + seq.l)) % 3;
#if DBG>1
fprintf(bcftools_stderr," npad: %d\n",npad);
#endif
}
if ( seq.s-codon==2 )
{
- tmp[2] = seq.s[0];
+ tmp[2] = seq.s[0];
i = 1;
}
else if ( seq.s-codon==1 )
{
- tmp[1] = seq.s[0];
+ tmp[1] = seq.s[0];
tmp[2] = seq.s[1];
i = 0;
}
void tscript_splice_ref(tscript_t *tr)
{
int i, len = 0;
- for (i=0; i<tr->ncds; i++)
+ for (i=0; i<tr->ncds; i++)
len += tr->cds[i]->len;
tr->nsref = len + 2*N_REF_PAD;
vrec_t *vrec = vbuf->vrec[i];
// if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
- if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
+ if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
csq->type.type &= ~CSQ_SPLICE_REGION;
if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
if ( csq->type.vcf_ial != vrec->vcsq[i].vcf_ial ) continue;
if ( (csq->type.type&CSQ_UPSTREAM_STOP)^(vrec->vcsq[i].type&CSQ_UPSTREAM_STOP) ) continue; // both must or mustn't have upstream_stop
- if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
+ if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
{
// This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
// can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
// consequences:
// stop_lost|AL627309.1|ENST00000423372|protein_coding|-
// stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
- if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
+ if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
{
if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
{
vrec->vcsq[i].type |= csq->type.type;
// remove stop_lost&synonymous if stop_retained set
- if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
+ if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
}
if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
}
- vrec->vcsq[i].type |= csq->type.type;
+ vrec->vcsq[i].type |= csq->type.type;
goto exit_duplicate;
}
}
{
if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
- if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
+ if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
{
vrec->vcsq[i].type |= csq->type.type;
goto exit_duplicate;
csq->type.biotype = tr->type;
// only now we see the translated sequence and can determine if the stop/start changes are real
- int rm_csq = 0;
+ int rm_csq = 0;
csq->type.type = 0;
for (i=ibeg; i<=iend; i++)
csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
}
if ( csq->type.type & CSQ_STOP_LOST )
{
- if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
{
rm_csq |= CSQ_STOP_LOST;
csq->type.type |= CSQ_STOP_RETAINED;
}
else
{
- for (i=0; i<hap->tref.l; i++)
- if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
- if ( i==hap->tref.l )
+ int aa_change = 0;
+ for (i=0; i<hap->tref.l; i++)
+ {
+ if ( hap->tref.s[i] == hap->tseq.s[i] ) continue;
+ aa_change = 1;
+ if ( hap->tref.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_LOST;
+ else if ( hap->tseq.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_GAINED;
+ else
+ csq->type.type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( !aa_change )
csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
- else if ( hap->tref.s[i] == '*' )
- csq->type.type |= CSQ_STOP_LOST;
- else if ( hap->tseq.s[i] == '*' )
- csq->type.type |= CSQ_STOP_GAINED;
- else
- csq->type.type |= CSQ_MISSENSE_VARIANT;
}
}
// Check if compound inframe variants are real inframes, or if the stop codon occurs before the frameshift can be restored
// The spliced sequence has been built for the current haplotype and stored
// in hap->sseq. Now we break it and output as independent parts
-
+
kstring_t sseq;
sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript
hap->upstream_stop = 0;
// check for duplicate records
i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
- if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
+ if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
{
// vcf record with a new pos
rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
vrec->line->pos = save_pos; // this is necessary for compound variants
continue;
}
-
+
args->str.l = 0;
kput_vcsq(args, &vrec->vcsq[0], &args->str);
for (j=1; j<vrec->nvcsq; j++)
int i = 0;
while ( ref[i] && vcf[i] )
{
- if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
+ if ( ref[i]!=vcf[i] && toupper(ref[i])!=toupper(vcf[i]) )
error("Error: the fasta reference does not match the VCF REF allele at %s:%"PRId64" .. fasta=%c vcf=%c\n",
bcf_seqname(args->hdr,rec),(int64_t) rec->pos+vbeg+1,ref[i],vcf[i]);
i++;
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
}
if ( csq_type & CSQ_STOP_LOST )
{
- if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
{
csq_type &= ~CSQ_STOP_LOST;
csq_type |= CSQ_STOP_RETAINED;
}
else
{
- for (j=0; j<tref->l; j++)
- if ( tref->s[j] != tseq->s[j] ) break;
- if ( j==tref->l )
+ int aa_change = 0;
+ for (j=0; j<tref->l; j++)
+ {
+ if ( tref->s[j] == tseq->s[j] ) continue;
+ aa_change = 1;
+ if ( tref->s[j] == '*' )
+ csq_type |= CSQ_STOP_LOST;
+ else if ( tseq->s[j] == '*' )
+ csq_type |= CSQ_STOP_GAINED;
+ else
+ csq_type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( !aa_change )
csq_type |= CSQ_SYNONYMOUS_VARIANT;
- else if ( tref->s[j] == '*' )
- csq_type |= CSQ_STOP_LOST;
- else if ( tseq->s[j] == '*' )
- csq_type |= CSQ_STOP_GAINED;
- else
- csq_type |= CSQ_MISSENSE_VARIANT;
}
if ( csq_type & CSQ_COMPOUND )
{
tr->root->ncsq_list++;
hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
- rm_csq->type.vstr = str;
+ rm_csq->type.vstr = str;
}
if ( csq_type & ~CSQ_COMPOUND )
{
fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
overlaps_warned = 1;
}
- if ( args->out )
+ if ( args->out )
fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
}
else ret = 1; // prevent reporting as intron in test_tscript
}
if ( child->type==HAP_SSS )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
// apply the VCF variants and extend the haplotype tree
int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
ngts /= bcf_hdr_nsamples(args->hdr);
- if ( ngts!=1 && ngts!=2 )
+ if ( ngts!=1 && ngts!=2 )
{
if ( args->verbosity && (!multiploid_warned || args->verbosity > 1) )
{
fprintf(bcftools_stderr," This message is printed only once, the verbosity can be increased with `--verbose 2`\n");
multiploid_warned = 1;
}
- if ( args->out )
+ if ( args->out )
fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%"PRId64"\t%s>%s\n", chr,(int64_t) rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
continue;
}
}
if ( child->type==HAP_SSS )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.biotype = tr->type;
splice.csq = 0;
int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
splice.csq = 0;
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
csq.pos = rec->pos;
csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
{
while ( regitr_overlap(args->itr) )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
tscript_t *tr = cds->tr;
{
while ( regitr_overlap(args->itr) )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
tscript_t *tr = utr->tr;
while ( regitr_overlap(args->itr) )
{
- csq_t csq;
+ csq_t csq;
memset(&csq, 0, sizeof(csq_t));
tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
splice.vcf.alt = rec->d.allele[1];
return;
}
- if ( args->rid != rec->rid )
+ if ( args->rid != rec->rid )
{
hap_flush(args, REGIDX_MAX);
vbuf_flush(args, REGIDX_MAX);
static const char *usage(void)
{
- return
+ return
"\n"
"About: Haplotype-aware consequence caller.\n"
"Usage: bcftools csq [OPTIONS] in.vcf\n"
" -g, --gff-annot FILE GFF3 annotation file\n"
"\n"
"CSQ options:\n"
- " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
+ " -B, --trim-protein-seq INT Abbreviate protein-changing predictions to max INT aminoacids\n"
" -c, --custom-tag STRING Use this tag instead of the default BCSQ\n"
" -l, --local-csq Localized predictions, consider only one VCF record at a time\n"
" -n, --ncsq INT Maximum number of per-haplotype consequences to consider for each site [15]\n"
{"no-version",no_argument,NULL,3},
{0,0,0,0}
};
- int c, targets_is_file = 0, regions_is_file = 0;
+ int c, targets_is_file = 0, regions_is_file = 0;
int regions_overlap = 1;
int targets_overlap = 0;
char *targets_list = NULL, *regions_list = NULL, *tmp;
while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:bB:v:",loptions,NULL)) >= 0)
{
- switch (c)
+ switch (c)
{
case 1 : args->force = 1; break;
case 2 :
args->brief_predictions = 1;
fprintf(bcftools_stderr,"Warning: the -b option will be removed in future versions. Please use -B 1 instead.\n");
break;
- case 'B':
+ case 'B':
args->brief_predictions = strtol(optarg,&tmp,10);
if ( *tmp || args->brief_predictions<1 ) error("Could not parse argument: --trim-protein-seq %s\n", optarg);
break;
case 'l': args->local_csq = 1; break;
case 'c': args->bcsq_tag = optarg; break;
case 'q': error("Error: the -q option has been deprecated, use -v, --verbose instead.\n"); break;
- case 'v':
+ case 'v':
args->verbosity = atoi(optarg);
if ( args->verbosity<0 || args->verbosity>2 ) error("Error: expected integer 0-2 with -v, --verbose\n");
break;
case 'p':
- switch (optarg[0])
+ switch (optarg[0])
{
case 'a': args->phase = PHASE_AS_IS; break;
case 'm': args->phase = PHASE_MERGE; break;
break;
case 'f': args->fa_fname = optarg; break;
case 'g': args->gff_fname = optarg; break;
- case 'n':
+ case 'n':
args->ncsq2_max = 2 * atoi(optarg);
if ( args->ncsq2_max <= 0 ) error("Expected positive integer with -n, got %s\n", optarg);
break;
/* filter.c -- filter expressions.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types
int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
+ // -3: select indices on the fly based on values in GT
int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
float *tmpf;
kstring_t tmps;
int max_unpack, mtmpi, mtmpf, nsamples;
+ struct {
+ bcf1_t *line;
+ int32_t *buf, nbuf, mbuf; // GTs as obtained by bcf_get_genotypes()
+ uint64_t *mask; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles
+ } cached_GT;
#if ENABLE_PERL_FILTERS
PerlInterpreter *perl;
#endif
+ char **undef_tag;
+ int nundef_tag;
+ int status, exit_on_error;
};
return TOK_VAL;
}
+#define FILTER_OK 0
+#define FILTER_ERR_UNKN_TAGS 1
+#define FILTER_ERR_OTHER 2
+
+static void filter_add_undef_tag(filter_t *filter, char *str)
+{
+ int i;
+ for (i=0; i<filter->nundef_tag; i++)
+ if ( !strcmp(str,filter->undef_tag[i]) ) break;
+ if ( i<filter->nundef_tag ) return;
+ filter->nundef_tag++;
+ filter->undef_tag = (char**)realloc(filter->undef_tag,sizeof(*filter->undef_tag)*filter->nundef_tag);
+ if ( !filter->undef_tag ) error("Could not allocate memory\n");
+ filter->undef_tag[filter->nundef_tag-1] = strdup(str);
+ if ( !filter->undef_tag[filter->nundef_tag-1] ) error("Could not allocate memory\n");
+}
+const char **filter_list_undef_tags(filter_t *filter, int *ntags)
+{
+ *ntags = filter->nundef_tag;
+ return (const char**)filter->undef_tag;
+}
+
/*
Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
return strdup(path);
}
+// Read the genotypes of `line` into flt->cached_GT so that they are fetched
+// with bcf_get_genotypes() only once per record.
+//  - cached_GT.buf  .. the raw GT values, nbuf in total
+//  - cached_GT.mask .. one 64-bit mask per sample, bit k is set when allele k
+//                      occurs in the sample's genotype (0/0 is 1, 0/1 is 3)
+// Returns 0 on success, -1 if the genotypes could not be obtained or if an
+// allele index above 63 was encountered (it does not fit in the mask).
+static int filters_cache_genotypes(filter_t *flt, bcf1_t *line)
+{
+    // The same record pointer as last time: reuse the outcome, nbuf<=0 marks an earlier failure.
+    // NOTE(review): the cache is keyed on the pointer only; presumably the caller resets
+    // cached_GT.line when a record buffer is reused for a new site - confirm
+    if ( flt->cached_GT.line==line ) return flt->cached_GT.nbuf > 0 ? 0 : -1;
+    flt->cached_GT.line = line;
+    flt->cached_GT.nbuf = bcf_get_genotypes(flt->hdr, line, &flt->cached_GT.buf, &flt->cached_GT.mbuf);
+    if ( flt->cached_GT.nbuf<=0 ) return -1;
+    if ( !flt->cached_GT.mask )
+    {
+        // allocated once and then reused for all subsequent records
+        flt->cached_GT.mask = (uint64_t*) malloc(sizeof(*flt->cached_GT.mask)*flt->nsamples);
+        if ( !flt->cached_GT.mask ) error("Could not alloc %zu bytes\n",sizeof(*flt->cached_GT.mask)*flt->nsamples);
+    }
+    int i,j, ngt1 = flt->cached_GT.nbuf / line->n_sample;   // number of GT values per sample
+    for (i=0; i<line->n_sample; i++)
+    {
+        int32_t *ptr = flt->cached_GT.buf + i*ngt1;
+        flt->cached_GT.mask[i] = 0;
+        for (j=0; j<ngt1; j++)
+        {
+            if ( bcf_gt_is_missing(ptr[j]) ) continue;
+            if ( ptr[j]==bcf_int32_vector_end ) break;
+            int allele = bcf_gt_allele(ptr[j]);
+            if ( allele > 63 )
+            {
+                static int warned = 0;
+                if ( !warned )
+                {
+                    fprintf(stderr,"Too many alleles, skipping GT filtering at this site %s:%"PRId64". "
+                            "(This warning is printed only once.)\n", bcf_seqname(flt->hdr,line),(int64_t) line->pos+1);
+                    warned = 1;
+                }
+                flt->cached_GT.nbuf = 0;    // remember the failure for repeated calls on this record
+                return -1;
+            }
+            // the shift must be done in 64 bits: a plain int 1<<allele is undefined for allele>30
+            flt->cached_GT.mask[i] |= (uint64_t)1<<allele;
+        }
+    }
+    return 0;
+}
static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->values[i] = ptr[tok->idx];
}
}
+ else if ( tok->idx==-3 )
+ {
+ // idx==-3: for each sample keep only the values whose index corresponds to an allele present in the sample's GT
+ if ( filters_cache_genotypes(flt,line)!=0 )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ int32_t *src = flt->tmpi + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ // source values are AD[0..nsrc1]; the GT mask has 64 bits, so indexes above 63 can never be selected
+ for (k=0; k<nsrc1 && k<64; k++)
+ {
+ // test the bit with a 64-bit shift, a plain int 1<<k is undefined for k>30
+ if ( !(flt->cached_GT.mask[i] & ((uint64_t)1<<k)) ) continue;
+ dst[j++] = src[k];
+ }
+ // mark the remaining slots of this sample as unused
+ for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
+ }
+ }
else
{
int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
tok->values[i] = ptr[tok->idx];
}
}
+ else if ( tok->idx==-3 )
+ {
+ // idx==-3: for each sample keep only the values whose index corresponds to an allele present in the sample's GT
+ if ( filters_cache_genotypes(flt,line)!=0 )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ float *src = flt->tmpf + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ // source values are AF[0..nsrc1]; the GT mask has 64 bits, so indexes above 63 can never be selected
+ for (k=0; k<nsrc1 && k<64; k++)
+ {
+ // test the bit with a 64-bit shift, a plain int 1<<k is undefined for k>30
+ if ( !(flt->cached_GT.mask[i] & ((uint64_t)1<<k)) ) continue;
+ // carry the missing / vector-end markers over from float to double
+ if ( bcf_float_is_missing(src[k]) )
+ bcf_double_set_missing(dst[j]);
+ else if ( bcf_float_is_vector_end(src[k]) )
+ bcf_double_set_vector_end(dst[j]);
+ else
+ dst[j] = src[k];
+ j++;
+ }
+ // mark the remaining slots of this sample as unused
+ for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
+ }
+ }
else
{
int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
tok->str_value.s[tok->str_value.l] = 0;
tok->nval1 = nvals1;
}
-static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
-static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
-static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
+static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } // rr, ra, aa, aA etc
+static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } // hap, hom, het
+static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } // mis, alt, ref
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
{ \
tok_init_values(atok, btok, rtok); \
tok_init_samples(atok, btok, rtok); \
- if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
+ if ( !atok->nsamples && !btok->nsamples ) \
{ \
- assert( atok->nsamples==btok->nsamples ); \
- for (i=0; i<atok->nvalues; i++) \
+ if ( atok->nvalues!=btok->nvalues && atok->nvalues!=1 && btok->nvalues!=1 ) \
+ error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nvalues,btok->nvalues); \
+ int ir,ia = 0, ib = 0; \
+ for (ir=0; ir<rtok->nvalues; ir++) \
{ \
- if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
+ if ( atok->nvalues > 1 ) ia = ir; \
+ if ( btok->nvalues > 1 ) ib = ir; \
+ if ( bcf_double_is_missing_or_vector_end(atok->values[ia]) || bcf_double_is_missing_or_vector_end(btok->values[ib]) ) \
{ \
- bcf_double_set_missing(rtok->values[i]); \
+ bcf_double_set_missing(rtok->values[ir]); \
continue; \
} \
has_values = 1; \
- rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \
+ rtok->values[ir] = TYPE atok->values[ia] AOP TYPE btok->values[ib]; \
+ } \
+ } \
+ else if ( atok->nsamples && btok->nsamples ) \
+ { \
+ assert( atok->nsamples==btok->nsamples ); \
+ if ( atok->nval1!=btok->nval1 && atok->nval1!=1 && btok->nval1!=1 ) \
+ error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nval1,btok->nval1); \
+ for (i=0; i<rtok->nsamples; i++) \
+ { \
+ double *rval = rtok->values + i*rtok->nval1; \
+ double *aval = atok->values + i*atok->nval1; \
+ double *bval = btok->values + i*btok->nval1; \
+ int ir,ia = 0, ib = 0; \
+ for (ir=0; ir<rtok->nval1; ir++) \
+ { \
+ if ( atok->nval1 > 1 ) ia = ir; \
+ if ( btok->nval1 > 1 ) ib = ir; \
+ if ( bcf_double_is_missing_or_vector_end(aval[ia]) || bcf_double_is_missing_or_vector_end(bval[ib]) ) \
+ { \
+ bcf_double_set_missing(rval[ir]); \
+ continue; \
+ } \
+ has_values = 1; \
+ rval[ir] = TYPE aval[ia] AOP TYPE bval[ib]; \
+ } \
} \
} \
else if ( atok->nsamples ) \
*idx = -2;
return 0;
}
+ if ( !strcmp("GT", tag_idx) )
+ {
+ *idxs = (int*) malloc(sizeof(int));
+ (*idxs)[0] = -1;
+ *nidxs = 1;
+ *idx = -3;
+ return 0;
+ }
// TAG[integer] .. one field; idx positive
char *end, *beg = tag_idx;
tok->idxs = (int*) malloc(sizeof(int));
tok->idxs[0] = -1;
tok->nidxs = 1;
- tok->idx = -2;
+ tok->idx = idx1;
}
else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
tok->usmpl[idx1] = 1;
}
- else if ( idx1==-2 )
+ else if ( idx1==-2 || idx1==-3 )
{
for (i=0; i<nidxs1; i++)
{
if ( is_fmt==-1 ) is_fmt = 0;
}
if ( is_array )
+ {
parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
+ if ( tok->idx==-3 && bcf_hdr_id2length(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=BCF_VL_R )
+ error("Error: GT subscripts can be used only with Number=R tags\n");
+ }
else if ( is_fmt && !tok->nsamples )
{
int i;
{
errno = 0;
tok->threshold = strtod(tmp.s, &end); // float?
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ if ( errno!=0 || end!=tmp.s+len )
+ {
+ if ( filter->exit_on_error )
+ error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ filter->status |= FILTER_ERR_UNKN_TAGS;
+ filter_add_undef_tag(filter,tmp.s);
+ }
}
tok->is_constant = 1;
return 0;
}
-
static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
{
int i;
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
-filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
+static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error)
{
filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
filter->str = strdup(str);
filter->hdr = hdr;
filter->max_unpack |= BCF_UN_STR;
+ filter->exit_on_error = exit_on_error;
int nops = 0, mops = 0; // operators stack
int nout = 0, mout = 0; // filter tokens, RPN
filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
return filter;
}
+// Parse the expression `str` against the header `hdr` with exit_on_error
+// switched off: an undefined tag is recorded in the filter status and in the
+// list of undefined tags instead of terminating the program.
+filter_t *filter_parse(bcf_hdr_t *hdr, const char *str)
+{
+    const int exit_on_error = 0;
+    return filter_init_(hdr, str, exit_on_error);
+}
+// Parse the expression `str` against the header `hdr` with exit_on_error
+// switched on: the parser calls error() when it meets an undefined tag.
+filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
+{
+    const int exit_on_error = 1;
+    return filter_init_(hdr, str, exit_on_error);
+}
void filter_destroy(filter_t *filter)
{
free(filter->filters[i].regex);
}
}
+ for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
+ free(filter->undef_tag);
+ free(filter->cached_GT.buf);
+ free(filter->cached_GT.mask);
free(filter->filters);
free(filter->flt_stack);
free(filter->str);
int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
{
+ if ( filter->status != FILTER_OK ) error("Error: the caller did not check the filter status\n");
bcf_unpack(line, filter->max_unpack);
int i, nstack = 0;
}
}
+// Return the status flags collected while parsing the expression;
+// FILTER_OK (0) means no error flag has been set.
+int filter_status(filter_t *filter)
+{
+    int status = filter->status;
+    return status;
+}
+
/* filter.c -- filter expressions.
- Copyright (C) 2013-2022 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types
int idx; // 0-based index to VCF vectors,
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
+ // -3: select indices on the fly based on values in GT
int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
float *tmpf;
kstring_t tmps;
int max_unpack, mtmpi, mtmpf, nsamples;
+ struct {
+ bcf1_t *line;
+ int32_t *buf, nbuf, mbuf; // GTs as obtained by bcf_get_genotypes()
+ uint64_t *mask; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles
+ } cached_GT;
#if ENABLE_PERL_FILTERS
PerlInterpreter *perl;
#endif
+ char **undef_tag;
+ int nundef_tag;
+ int status, exit_on_error;
};
return TOK_VAL;
}
+#define FILTER_OK 0                 // no error flag set; filter_test() refuses to run with any other status
+#define FILTER_ERR_UNKN_TAGS 1      // OR-ed into the status when the expression uses a tag not defined in the VCF header
+#define FILTER_ERR_OTHER 2          // NOTE(review): not set anywhere in the visible code; presumably reserved for other parse errors - confirm
+
+// Append a copy of the tag name `str` to filter->undef_tag unless the same
+// name is already on the list. Allocation failures are reported via error().
+static void filter_add_undef_tag(filter_t *filter, char *str)
+{
+    int itag = 0;
+    while ( itag < filter->nundef_tag && strcmp(str,filter->undef_tag[itag]) ) itag++;
+    if ( itag != filter->nundef_tag ) return;   // already recorded
+
+    int last = filter->nundef_tag++;            // slot for the new entry
+    filter->undef_tag = (char**)realloc(filter->undef_tag,sizeof(*filter->undef_tag)*filter->nundef_tag);
+    if ( !filter->undef_tag ) error("Could not allocate memory\n");
+    filter->undef_tag[last] = strdup(str);
+    if ( !filter->undef_tag[last] ) error("Could not allocate memory\n");
+}
+const char **filter_list_undef_tags(filter_t *filter, int *ntags)
+{
+ *ntags = filter->nundef_tag;
+ return (const char**)filter->undef_tag;
+}
+
/*
Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
return strdup(path);
}
+/*
+ *  Read FORMAT/GT of `line` and cache it in flt->cached_GT: the raw values
+ *  returned by bcf_get_genotypes() go to `buf` and, for each sample, `mask`
+ *  gets a bitmask with bit k set when allele index k appears in the genotype.
+ *  The cache is keyed on the `line` pointer, so repeated calls with the same
+ *  pointer reuse the previous result.
+ *
+ *  Returns 0 on success and -1 when the genotypes could not be obtained or
+ *  when an allele index does not fit in the 64-bit mask.
+ */
+static int filters_cache_genotypes(filter_t *flt, bcf1_t *line)
+{
+    // Same record pointer as last time; nbuf<=0 remembers an earlier failure
+    if ( flt->cached_GT.line==line ) return flt->cached_GT.nbuf > 0 ? 0 : -1;
+    flt->cached_GT.line = line;
+    flt->cached_GT.nbuf = bcf_get_genotypes(flt->hdr, line, &flt->cached_GT.buf, &flt->cached_GT.mbuf);
+    if ( flt->cached_GT.nbuf<=0 ) return -1;
+    if ( !flt->cached_GT.mask )
+    {
+        // Allocated on first use, one mask per sample, and kept for later records
+        flt->cached_GT.mask = (uint64_t*) malloc(sizeof(*flt->cached_GT.mask)*flt->nsamples);
+        if ( !flt->cached_GT.mask ) error("Could not alloc %zu bytes\n",sizeof(*flt->cached_GT.mask)*flt->nsamples);
+    }
+    int i,j, ngt1 = flt->cached_GT.nbuf / line->n_sample;   // number of GT values per sample
+    for (i=0; i<line->n_sample; i++)
+    {
+        int32_t *ptr = flt->cached_GT.buf + i*ngt1;
+        flt->cached_GT.mask[i] = 0;
+        for (j=0; j<ngt1; j++)
+        {
+            if ( bcf_gt_is_missing(ptr[j]) ) continue;
+            if ( ptr[j]==bcf_int32_vector_end ) break;
+            int allele = bcf_gt_allele(ptr[j]);
+            if ( allele > 63 )
+            {
+                static int warned = 0;
+                if ( !warned )
+                {
+                    fprintf(bcftools_stderr,"Too many alleles, skipping GT filtering at this site %s:%"PRId64". "
+                            "(This warning is printed only once.)\n", bcf_seqname(flt->hdr,line),line->pos+1);
+                    warned = 1;
+                }
+                // Mark the cached result as unusable so that repeated calls return -1 as well
+                flt->cached_GT.nbuf = 0;
+                return -1;
+            }
+            // The shift must be carried out in 64 bits: `1<<allele` is an int
+            // shift, which is undefined for allele>=31 and corrupts the mask
+            flt->cached_GT.mask[i] |= (uint64_t)1 << allele;
+        }
+    }
+    return 0;
+}
static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->values[i] = ptr[tok->idx];
}
}
+ else if ( tok->idx==-3 )
+ {
+ if ( filters_cache_genotypes(flt,line)!=0 )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ int32_t *src = flt->tmpi + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ for (k=0; k<nsrc1; k++) // source values are AD[0..nsrc1]
+ {
+ if ( !(flt->cached_GT.mask[i] & (1<<k)) ) continue;
+ dst[j++] = src[k];
+ }
+ for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
+ }
+ }
else
{
int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
tok->values[i] = ptr[tok->idx];
}
}
+ else if ( tok->idx==-3 )
+ {
+ if ( filters_cache_genotypes(flt,line)!=0 )
+ {
+ tok->nvalues = 0;
+ return;
+ }
+ for (i=0; i<tok->nsamples; i++)
+ {
+ if ( !tok->usmpl[i] ) continue;
+ float *src = flt->tmpf + i*nsrc1;
+ double *dst = tok->values + i*tok->nval1;
+ int k, j = 0;
+ for (k=0; k<nsrc1; k++) // source values are AF[0..nsrc1]
+ {
+ if ( !(flt->cached_GT.mask[i] & (1<<k)) ) continue;
+ if ( bcf_float_is_missing(src[k]) )
+ bcf_double_set_missing(dst[j]);
+ else if ( bcf_float_is_vector_end(src[k]) )
+ bcf_double_set_vector_end(dst[j]);
+ else
+ dst[j] = src[k];
+ j++;
+ }
+ for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
+ }
+ }
else
{
int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
tok->str_value.s[tok->str_value.l] = 0;
tok->nval1 = nvals1;
}
-static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
-static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
-static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
+// Thin wrappers around _filters_set_genotype(); the last argument selects the
+// family of genotype keywords being matched (listed next to each wrapper).
+static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok)   // rr, ra, aa, aA etc
+{
+    _filters_set_genotype(flt, line, tok, 2);
+}
+static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok)   // hap, hom, het
+{
+    _filters_set_genotype(flt, line, tok, 3);
+}
+static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok)   // mis, alt, ref
+{
+    _filters_set_genotype(flt, line, tok, 4);
+}
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
{
{ \
tok_init_values(atok, btok, rtok); \
tok_init_samples(atok, btok, rtok); \
- if ( (atok->nsamples && btok->nsamples) || (!atok->nsamples && !btok->nsamples)) \
+ if ( !atok->nsamples && !btok->nsamples ) \
{ \
- assert( atok->nsamples==btok->nsamples ); \
- for (i=0; i<atok->nvalues; i++) \
+ if ( atok->nvalues!=btok->nvalues && atok->nvalues!=1 && btok->nvalues!=1 ) \
+ error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nvalues,btok->nvalues); \
+ int ir,ia = 0, ib = 0; \
+ for (ir=0; ir<rtok->nvalues; ir++) \
{ \
- if ( bcf_double_is_missing_or_vector_end(atok->values[i]) || bcf_double_is_missing_or_vector_end(btok->values[i]) ) \
+ if ( atok->nvalues > 1 ) ia = ir; \
+ if ( btok->nvalues > 1 ) ib = ir; \
+ if ( bcf_double_is_missing_or_vector_end(atok->values[ia]) || bcf_double_is_missing_or_vector_end(btok->values[ib]) ) \
{ \
- bcf_double_set_missing(rtok->values[i]); \
+ bcf_double_set_missing(rtok->values[ir]); \
continue; \
} \
has_values = 1; \
- rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \
+ rtok->values[ir] = TYPE atok->values[ia] AOP TYPE btok->values[ib]; \
+ } \
+ } \
+ else if ( atok->nsamples && btok->nsamples ) \
+ { \
+ assert( atok->nsamples==btok->nsamples ); \
+ if ( atok->nval1!=btok->nval1 && atok->nval1!=1 && btok->nval1!=1 ) \
+ error("Cannot run numeric operator in -i/-e filtering on vectors of different lengths: %d vs %d\n",atok->nval1,btok->nval1); \
+ for (i=0; i<rtok->nsamples; i++) \
+ { \
+ double *rval = rtok->values + i*rtok->nval1; \
+ double *aval = atok->values + i*atok->nval1; \
+ double *bval = btok->values + i*btok->nval1; \
+ int ir,ia = 0, ib = 0; \
+ for (ir=0; ir<rtok->nval1; ir++) \
+ { \
+ if ( atok->nval1 > 1 ) ia = ir; \
+ if ( btok->nval1 > 1 ) ib = ir; \
+ if ( bcf_double_is_missing_or_vector_end(aval[ia]) || bcf_double_is_missing_or_vector_end(bval[ib]) ) \
+ { \
+ bcf_double_set_missing(rval[ir]); \
+ continue; \
+ } \
+ has_values = 1; \
+ rval[ir] = TYPE aval[ia] AOP TYPE bval[ib]; \
+ } \
} \
} \
else if ( atok->nsamples ) \
*idx = -2;
return 0;
}
+ if ( !strcmp("GT", tag_idx) )
+ {
+ *idxs = (int*) malloc(sizeof(int));
+ (*idxs)[0] = -1;
+ *nidxs = 1;
+ *idx = -3;
+ return 0;
+ }
// TAG[integer] .. one field; idx positive
char *end, *beg = tag_idx;
tok->idxs = (int*) malloc(sizeof(int));
tok->idxs[0] = -1;
tok->nidxs = 1;
- tok->idx = -2;
+ tok->idx = idx1;
}
else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
tok->usmpl[idx1] = 1;
}
- else if ( idx1==-2 )
+ else if ( idx1==-2 || idx1==-3 )
{
for (i=0; i<nidxs1; i++)
{
if ( is_fmt==-1 ) is_fmt = 0;
}
if ( is_array )
+ {
parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
+ if ( tok->idx==-3 && bcf_hdr_id2length(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=BCF_VL_R )
+ error("Error: GT subscripts can be used only with Number=R tags\n");
+ }
else if ( is_fmt && !tok->nsamples )
{
int i;
{
errno = 0;
tok->threshold = strtod(tmp.s, &end); // float?
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ if ( errno!=0 || end!=tmp.s+len )
+ {
+ if ( filter->exit_on_error )
+ error("[%s:%d %s] Error: the tag \"%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ filter->status |= FILTER_ERR_UNKN_TAGS;
+ filter_add_undef_tag(filter,tmp.s);
+ }
}
tok->is_constant = 1;
return 0;
}
-
static void filter_debug_print(token_t *toks, token_t **tok_ptrs, int ntoks)
{
int i;
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
-filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
+static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error)
{
filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
filter->str = strdup(str);
filter->hdr = hdr;
filter->max_unpack |= BCF_UN_STR;
+ filter->exit_on_error = exit_on_error;
int nops = 0, mops = 0; // operators stack
int nout = 0, mout = 0; // filter tokens, RPN
filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
return filter;
}
+// Parse the expression `str` against the header `hdr` with exit_on_error
+// switched off: an undefined tag is recorded in the filter status and in the
+// list of undefined tags instead of terminating the program.
+filter_t *filter_parse(bcf_hdr_t *hdr, const char *str)
+{
+    const int exit_on_error = 0;
+    return filter_init_(hdr, str, exit_on_error);
+}
+// Parse the expression `str` against the header `hdr` with exit_on_error
+// switched on: the parser calls error() when it meets an undefined tag.
+filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
+{
+    const int exit_on_error = 1;
+    return filter_init_(hdr, str, exit_on_error);
+}
void filter_destroy(filter_t *filter)
{
free(filter->filters[i].regex);
}
}
+ for (i=0; i<filter->nundef_tag; i++) free(filter->undef_tag[i]);
+ free(filter->undef_tag);
+ free(filter->cached_GT.buf);
+ free(filter->cached_GT.mask);
free(filter->filters);
free(filter->flt_stack);
free(filter->str);
int filter_test(filter_t *filter, bcf1_t *line, const uint8_t **samples)
{
+ if ( filter->status != FILTER_OK ) error("Error: the caller did not check the filter status\n");
bcf_unpack(line, filter->max_unpack);
int i, nstack = 0;
}
}
+// Return the status flags collected while parsing the expression;
+// FILTER_OK (0) means no error flag has been set.
+int filter_status(filter_t *filter)
+{
+    int status = filter->status;
+    return status;
+}
+
/* filter.h -- filter expressions.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
/**
* @hdr: BCF header file
* @str: see the bcftools filter command help for description
+ *
+ * Same as filter_parse() but exits on errors
*/
filter_t *filter_init(bcf_hdr_t *hdr, const char *str);
void filter_expression_info(FILE *fp);
int filter_max_unpack(filter_t *filter);
+/**
+ * Same as filter_init() but may not exit on some types of errors. The caller
+ * must check that the returned value is not NULL and that a subsequent call
+ * of filter_status() returns FILTER_OK before filter_test() can be called.
+ */
+filter_t *filter_parse(bcf_hdr_t *hdr, const char *str);
+
+#define FILTER_OK 0                 // no error flag set; required before the filter can be evaluated
+#define FILTER_ERR_UNKN_TAGS 1      // the expression uses one or more tags not defined in the VCF header, see filter_list_undef_tags()
+#define FILTER_ERR_OTHER 2          // NOTE(review): not set anywhere in the visible code; presumably reserved for other parse errors - confirm
+
+/**
+ * Check if filter_parse() was successful
+ */
+int filter_status(filter_t *filter);
+const char **filter_list_undef_tags(filter_t *filter, int *nundef);
+
#endif
if (argc < 2) { usage(stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2022 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ printf("bcftools %s\nUsing htslib %s\nCopyright (C) 2023 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
printf("License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
if (argc < 2) { usage(bcftools_stderr); return 1; }
if (strcmp(argv[1], "version") == 0 || strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-v") == 0) {
- fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2022 Genome Research Ltd.\n", bcftools_version(), hts_version());
+ fprintf(bcftools_stdout, "bcftools %s\nUsing htslib %s\nCopyright (C) 2023 Genome Research Ltd.\n", bcftools_version(), hts_version());
#if USE_GPL
fprintf(bcftools_stdout, "License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n");
#else
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
while ( *ptr && isspace(*ptr) ) ptr++;
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
*tmp = 0;
- int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
+ int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
if ( ismpl<0 ) continue;
if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
if ( !khash_str2int_has_key(grp2idx,ptr+1) )
{
if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
int igrp = smpl2grp[i] - 1;
- if ( !call->smpl_grp[igrp].nsmpl )
+ if ( !call->smpl_grp[igrp].nsmpl )
call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
call->smpl_grp[igrp].nsmpl++;
static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
{
int ia, ib, i;
- int ngts_ori = nals_ori*(nals_ori+1)/2;
+ int ngts_ori = nals_ori*(nals_ori+1)/2;
int ngts_new = call->nals_new*(call->nals_new+1)/2;
int nsmpl = grp->nsmpl;
static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
{
assert( call->tgt_als->n );
- if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n);
hts_expand(char*,call->tgt_als->n+1,call->nals,call->als);
+ hts_expand(int,call->tgt_als->n+1,call->nals_map,call->als_map);
+ hts_expand(int,(call->tgt_als->n+1)*(call->tgt_als->n+2)/2,call->npl_map,call->pl_map);
int has_new = 0;
{
call->als[nals] = call->tgt_als->allele[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]);
-
- // if ( j+1==*unseen )
- // {
- // fprintf(stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. VCF=",i,call->tgt_als->allele[i],j,*unseen);
- // int k;
- // for (k=0; k<rec->n_allele; k++) fprintf(stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
- // fprintf(stderr,"\tTAB=");
- // for (k=0; k<call->tgt_als->n; k++) fprintf(stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
- // fprintf(stderr,"\n");
- // return -1;
- // }
-
if ( j>=0 )
{
// existing allele
bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
if ( nals_ori > 8*sizeof(call->als_new) )
- {
+ {
fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
- return 0;
+ return 0;
}
// For each group find the best combination of alleles
for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
- {
+ {
fprintf(stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
- return 0;
+ return 0;
}
if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
{
anno16_t a;
float tmpf[4];
int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
- if ( is_tested )
+ if ( is_tested )
{
for (i=0; i<4; i++) tmpf[i] = a.p[i];
bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
while ( *ptr && isspace(*ptr) ) ptr++;
if ( !*ptr ) error("Could not parse the line in %s, expected a sample name followed by tab and a population name: %s\n",call->sample_groups,lines[i]);
*tmp = 0;
- int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
+ int ismpl = bcf_hdr_id2int(call->hdr, BCF_DT_SAMPLE, lines[i]);
if ( ismpl<0 ) continue;
if ( smpl2grp[ismpl] ) error("Error: the sample \"%s\" is listed twice in %s\n", lines[i],call->sample_groups);
if ( !khash_str2int_has_key(grp2idx,ptr+1) )
{
if ( !smpl2grp[i] ) error("Error: The sample \"%s\" is not listed in %s\n",call->hdr->samples[i],call->sample_groups);
int igrp = smpl2grp[i] - 1;
- if ( !call->smpl_grp[igrp].nsmpl )
+ if ( !call->smpl_grp[igrp].nsmpl )
call->smpl_grp[igrp].smpl = (uint32_t*)calloc(grp2n[igrp],sizeof(uint32_t));
call->smpl_grp[igrp].smpl[call->smpl_grp[igrp].nsmpl] = i;
call->smpl_grp[igrp].nsmpl++;
static void mcall_call_genotypes(call_t *call, int nals_ori, smpl_grp_t *grp)
{
int ia, ib, i;
- int ngts_ori = nals_ori*(nals_ori+1)/2;
+ int ngts_ori = nals_ori*(nals_ori+1)/2;
int ngts_new = call->nals_new*(call->nals_new+1)/2;
int nsmpl = grp->nsmpl;
static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
{
assert( call->tgt_als->n );
- if ( call->tgt_als->n>5 ) error("Maximum accepted number of alleles is 5, got %d\n", call->tgt_als->n);
hts_expand(char*,call->tgt_als->n+1,call->nals,call->als);
+ hts_expand(int,call->tgt_als->n+1,call->nals_map,call->als_map);
+ hts_expand(int,(call->tgt_als->n+1)*(call->tgt_als->n+2)/2,call->npl_map,call->pl_map);
int has_new = 0;
{
call->als[nals] = call->tgt_als->allele[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, call->tgt_als->allele[i]);
-
- // if ( j+1==*unseen )
- // {
- // fprintf(bcftools_stderr,"Fixme? Cannot constrain to %d-th allele (%s); j=%d,unseen=%d. VCF=",i,call->tgt_als->allele[i],j,*unseen);
- // int k;
- // for (k=0; k<rec->n_allele; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",rec->d.allele[k]);
- // fprintf(bcftools_stderr,"\tTAB=");
- // for (k=0; k<call->tgt_als->n; k++) fprintf(bcftools_stderr,"%s%s",k==0?"":",",call->tgt_als->allele[k]);
- // fprintf(bcftools_stderr,"\n");
- // return -1;
- // }
-
if ( j>=0 )
{
// existing allele
bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
if ( nals_ori > 8*sizeof(call->als_new) )
- {
+ {
fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
- return 0;
+ return 0;
}
// For each group find the best combination of alleles
for (i=0; i<call->nals_new; i++) call->ac[i] = 0;
if ( call->flag & CALL_CONSTR_TRIO && call->nals_new>4 )
- {
+ {
fprintf(bcftools_stderr,"Too many alleles at %s:%"PRId64", skipping.\n", bcf_seqname(call->hdr,rec),(int64_t) rec->pos+1);
- return 0;
+ return 0;
}
if ( call->output_tags & (CALL_FMT_GQ|CALL_FMT_GP) )
{
anno16_t a;
float tmpf[4];
int is_tested = test16(call->anno16, &a) >= 0 && a.is_tested ? 1 : 0;
- if ( is_tested )
+ if ( is_tested )
{
for (i=0; i<4; i++) tmpf[i] = a.p[i];
bcf_update_info_float(call->hdr, rec, "PV4", tmpf, 4);
// Data shared by all bam files
typedef struct {
int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
- max_indel_depth, max_read_len, fmt_flag, ambig_reads;
+ max_indel_depth, max_read_len, ambig_reads;
+ uint32_t fmt_flag;
int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
double min_frac; // for indels
bcf1_t *bcf_rec;
htsFile *bcf_fp;
bcf_hdr_t *bcf_hdr;
+ int indels_v20;
int argc;
char **argv;
} mplp_conf_t;
// We cache sample information here so we don't have to keep recomputing this
// on each and every pileup column. If FMT/SCR annotation is requested, a flag
// is set to indicate the presence of a soft clip.
-//
-// Cd is an arbitrary block of data we can write into, which ends up in
-// the pileup structures. We stash the sample ID there:
-// has_soft_clip .. cd->i & 1
-// sample_id .. cd->i >> 1
static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
{
+ cd->p = calloc(1,sizeof(plp_cd_t));
+
+ PLP_NM(cd) = PLP_NM_UNSET;
+
mplp_aux_t *ma = (mplp_aux_t *)data;
int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
- cd->i = 0;
- PLP_SET_SAMPLE_ID(cd->i, n);
+ PLP_SET_SAMPLE_ID(cd, n);
+
// Whether read has a soft-clip is used in mplp_realn's heuristics.
// TODO: consider whether clip length is beneficial to use?
int i;
for (i=0; i<b->core.n_cigar; i++) {
int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
if (cig == BAM_CSOFT_CLIP) {
- PLP_SET_SOFT_CLIP(cd->i);
+ PLP_SET_SOFT_CLIP(cd);
break;
}
}
// Possible further optimsation, check tot_ins==1 later
// (and remove break) so we can detect single bp indels.
// We may want to focus BAQ on more complex regions only.
- PLP_SET_INDEL(cd->i);
+ PLP_SET_INDEL(cd);
break;
}
return 0;
}
+// Pileup destructor callback: releases the per-read client data block that
+// pileup_constructor() allocated into cd->p. `data` and `b` are not used.
+// Always returns 0.
+static int pileup_destructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
+{
+    void *payload = cd->p;
+    free(payload);
+    return 0;
+}
static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
{
for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position
{
const bam_pileup1_t *p = plp[i] + j;
- int id = PLP_SAMPLE_ID(p->cd.i);
+ int id = PLP_SAMPLE_ID(&(p->cd));
if (m->n_plp[id] == m->m_plp[id])
{
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
nt += n_plp[i];
for (j = 0; j < n_plp[i]; j++) { // iterate over reads
bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
- has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
+ has_indel += (PLP_HAS_INDEL(&p->cd) || p->indel) ? 1 : 0;
// Has_clip is almost always true for very long reads
// (eg PacBio CCS), but these rarely matter as the clip
// is likely a long way from this indel.
- has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0;
+ has_clip += (PLP_HAS_SOFT_CLIP(&p->cd)) ? 1 : 0;
if (max_indel < p->indel)
max_indel = p->indel;
if (min_indel > p->indel)
// We could use our own structure (p->cd.p), allocated during
// the constructor, but for simplicity we play dirty and
// abuse an unused flag bit instead.
- if (b->core.flag & 32768)
- continue;
- b->core.flag |= 32768;
+ if ( PLP_IS_REALN(&(p->cd)) ) continue;
+ PLP_SET_REALN(&(p->cd));
if (b->core.l_qseq > max_read_len)
continue;
}
int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
if (has_ref && (conf->flag & MPLP_REALN))
- mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
- conf->max_read_len, ref, ref_len, pos);
+ mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, conf->max_read_len, ref, ref_len, pos);
int total_depth, _ref0, ref16;
for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
conf->bc.tid = tid; conf->bc.pos = pos;
bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
bcf_clear1(conf->bcf_rec);
- bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
- conf->bca, 0);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, 0);
flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
// call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
// check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
- if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
- && (bcf_callaux_clean(conf->bca, &conf->bc),
- bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
+ if ( !(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth )
{
- for (i = 0; i < conf->gplp->n; ++i)
- bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
- if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ conf->bca->chr = tid>=0 ? hdr->target_name[tid] : NULL;
+ int iret;
+ if ( conf->indels_v20 )
+ iret = bcf_iaux_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
+ else
+ iret = bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
+ if ( iret>=0 )
{
- bcf_clear1(conf->bcf_rec);
- bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
- flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ {
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ }
}
}
}
bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">");
+ if ( conf->fmt_flag&B2B_INFO_IDV )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">");
+ if ( conf->fmt_flag&B2B_INFO_IMF )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
if ( conf->fmt_flag&B2B_INFO_VDB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
- if (conf->fmt_flag & B2B_INFO_ZSCORE) {
- if ( conf->fmt_flag&B2B_INFO_RPB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_RPBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_MQBZ )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_BQBZ )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_MQSBZ )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
- if ( conf->fmt_flag&B2B_FMT_NMBZ )
- bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
- if ( conf->fmt_flag&B2B_INFO_SCB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
- } else {
- if ( conf->fmt_flag&B2B_INFO_RPB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
- }
-
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
-#if CDF_MWU_TESTS
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
-#endif
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_MIN_PL_SUM )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MIN_PL_SUM,Number=1,Type=Integer,Description=\"Sum of min PLs across all samples before normalization (experimental)\">");
+ if ( conf->fmt_flag&B2B_INFO_NM )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NM,Number=2,Type=Float,Description=\"Average number of mismatches in ref and alt reads (approximate, experimental, make me localized?)\">");
+ if ( conf->fmt_flag&B2B_INFO_NMBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better; approximate, experimental, make me localized?)\">");
+ if ( conf->fmt_flag&B2B_FMT_NMBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_SCBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_FS )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Fisher's exact test P-value to detect strand bias\">");
+ if ( conf->fmt_flag&B2B_INFO_SGB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric, http://samtools.github.io/bcftools/rd-SegBias.pdf\">");
+ if ( conf->fmt_flag&B2B_INFO_MQ0F )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
conf->bca->fmt_flag = conf->fmt_flag;
conf->bca->ambig_reads = conf->ambig_reads;
conf->bca->indel_win_size = conf->indel_win_size;
+ conf->bca->indels_v20 = conf->indels_v20;
conf->bc.bcf_hdr = conf->bcf_hdr;
conf->bc.n = nsmpl;
conf->max_indel_depth = conf->max_indel_depth * nsmpl;
conf->bcf_rec = bcf_init1();
bam_mplp_constructor(conf->iter, pileup_constructor);
+ bam_mplp_destructor(conf->iter, pileup_destructor);
+
// Run mpileup for multiple regions
if ( nregs )
}
#undef MAX_PATH_LEN
-int parse_format_flag(const char *str)
+// Helper for parse_format_flag(): compares `tag` case-insensitively with
+// str, "FMT/"str and "FORMAT/"str. On a match it prints msg to stderr when
+// msg is non-empty, clears bit in *flag if `exclude` is set (sets it
+// otherwise), frees tags[i] and continues the enclosing loop. It uses the
+// caller's `tag`, `exclude`, `flag`, `tags` and `i` by name, and is not
+// wrapped in do { } while (0) because `continue` must act on the caller's loop.
+#define SET_FMT_FLAG(str,bit,msg) \
+ if (!strcasecmp(tag,str) || !strcasecmp(tag,"FMT/"str) || !strcasecmp(tag,"FORMAT/"str)) \
+ { \
+ if ( *msg ) fprintf(stderr,"%s",msg); \
+ if ( exclude ) \
+ *flag &= ~bit; \
+ else \
+ *flag |= bit; \
+ free(tags[i]); \
+ continue; \
+ }
+// Helper for parse_format_flag(): compares `tag` case-insensitively with
+// "INFO/"str. On a match it prints msg to stderr when msg is non-empty (so
+// deprecation warnings such as the one for INFO/DPR are not lost), clears bit
+// in *flag if `exclude` is set (sets it otherwise), frees tags[i] and
+// continues the enclosing loop. It uses the caller's `tag`, `exclude`, `flag`,
+// `tags` and `i` by name, and is not wrapped in do { } while (0) because
+// `continue` must act on the caller's loop.
+#define SET_INFO_FLAG(str,bit,msg) if (!strcasecmp(tag,"INFO/"str)) \
+ { \
+ if ( *msg ) fprintf(stderr,"%s",msg); \
+ if ( exclude ) \
+ *flag &= ~bit; \
+ else \
+ *flag |= bit; \
+ free(tags[i]); \
+ continue; \
+ }
+
+// Updates *flag according to the annotation tags returned by
+// hts_readlist(str, 0, ...). A tag with a leading '-' clears the matching
+// B2B_* bit, any other tag sets it. Each SET_*_FLAG line frees tags[i] and
+// continues the loop on a match, so reaching the end of the loop body means
+// the tag was not recognised: an error is printed and the program exits.
+void parse_format_flag(uint32_t *flag, const char *str)
{
- int i, flag = 0, n_tags;
+ int i, n_tags;
char **tags = hts_readlist(str, 0, &n_tags);
for(i=0; i<n_tags; i++)
{
- if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
- else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
- else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
- else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
- else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
- else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
- else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
- else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
- else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
- else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
- else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
- else if ( !strcasecmp(tags[i],"NMBZ") || !strcasecmp(tags[i],"FORMAT/NMBZ") || !strcasecmp(tags[i],"FMT/NMBZ") ) flag |= B2B_FMT_NMBZ;
- else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
- else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
- else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
- else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
- else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
- else
- {
- fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
- exit(EXIT_FAILURE);
- }
- free(tags[i]);
+ int exclude = tags[i][0]=='-' ? 1 : 0;
+ char *tag = exclude ? tags[i]+1 : tags[i];
+ SET_FMT_FLAG("AD", B2B_FMT_AD, "");
+ SET_FMT_FLAG("ADF", B2B_FMT_ADF, "");
+ SET_FMT_FLAG("ADR", B2B_FMT_ADR, "");
+ SET_FMT_FLAG("DP", B2B_FMT_DP, "");
+ SET_FMT_FLAG("DP4", B2B_FMT_DP4, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n");
+ SET_FMT_FLAG("DPR", B2B_FMT_DPR, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n");
+ SET_FMT_FLAG("DV", B2B_FMT_DV, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n");
+ SET_FMT_FLAG("NMBZ", B2B_FMT_NMBZ, "");
+ SET_FMT_FLAG("QS", B2B_FMT_QS, "");
+ SET_FMT_FLAG("SP", B2B_FMT_SP, "");
+ SET_FMT_FLAG("SCR", B2B_FMT_SCR, "");
+ SET_INFO_FLAG("DPR", B2B_INFO_DPR, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n");
+ SET_INFO_FLAG("AD", B2B_INFO_AD, "");
+ SET_INFO_FLAG("ADF", B2B_INFO_ADF, "");
+ SET_INFO_FLAG("ADR", B2B_INFO_ADR, "");
+ SET_INFO_FLAG("BQBZ", B2B_INFO_BQBZ, "");
+ SET_INFO_FLAG("FS", B2B_INFO_FS, "");
+ SET_INFO_FLAG("IDV", B2B_INFO_IDV, "");
+ SET_INFO_FLAG("IMF", B2B_INFO_IMF, "");
+ SET_INFO_FLAG("MIN_PL_SUM", B2B_INFO_MIN_PL_SUM, "");
+ SET_INFO_FLAG("MQ0F", B2B_INFO_MQ0F, "");
+ SET_INFO_FLAG("MQBZ", B2B_INFO_MQBZ, "");
+ // MQSBZ is enabled by default and advertised by list_annotations(), so it
+ // must be parseable here, otherwise "-a -INFO/MQSBZ" aborts as unknown.
+ SET_INFO_FLAG("MQSBZ", B2B_INFO_MQSBZ, "");
+ SET_INFO_FLAG("NM", B2B_INFO_NM, "");
+ SET_INFO_FLAG("NMBZ", B2B_INFO_NMBZ, "");
+ SET_INFO_FLAG("RPBZ", B2B_INFO_RPBZ, "");
+ SET_INFO_FLAG("SCBZ", B2B_INFO_SCBZ, "");
+ SET_INFO_FLAG("SCR", B2B_INFO_SCR, "");
+ SET_INFO_FLAG("SGB", B2B_INFO_SGB, "");
+ SET_INFO_FLAG("VDB", B2B_INFO_VDB, "");
+ fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tag, str);
+ exit(EXIT_FAILURE);
}
if (n_tags) free(tags);
- return flag;
}
// todo: make it possible to turn off some annotations or change the defaults,
+// Prints the list of FORMAT and INFO annotation tags to fp; entries prefixed
+// with "*" are the ones described in the text as added by default. The
+// BQBZ/MQBZ/RPBZ entries say "U-z test" to agree with the header descriptions.
static void list_annotations(FILE *fp)
{
fprintf(fp,
-"\n"
-"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
-"\n"
-" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
-" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
-" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
-" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
-" FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
-" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
-" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
-" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
-"\n"
-"INFO annotation tags available:\n"
-"\n"
-" INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
-" INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
-" INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
-" INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
-"\n");
+ "Annotations added by default are in this list prefixed with \"*\". To suppress their output, run with\n"
+ "e.g. \"-a -FORMAT/AD\".\n"
+ "\n"
+ "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
+ "\n"
+ " FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+ " FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+ " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+ " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+ " FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
+ " FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
+ " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+ " FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
+ "\n"
+ "INFO annotation tags available:\n"
+ "\n"
+ " INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
+ " INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
+ " INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+ "* INFO/BQBZ .. Mann-Whitney U-z test of Base Quality Bias (Number=1,Type=Float)\n"
+ " INFO/FS .. Fisher's exact test P-value to detect strand bias (Number=1,Type=Float)\n"
+ "* INFO/IDV .. Maximum number of raw reads supporting an indel (Number=1,Type=Integer)\n"
+ "* INFO/IMF .. Maximum fraction of raw reads supporting an indel (Number=1,Type=Float)\n"
+ " INFO/MIN_PL_SUM\n"
+ " .. Sum of min PL across all samples before normalization, experimental (Number=1,Type=Integer)\n"
+ "* INFO/MQ0F .. Fraction of reads with zero mapping quality (Number=1,Type=Float)\n"
+ "* INFO/MQBZ .. Mann-Whitney U-z test of Mapping Quality Bias (Number=1,Type=Float)\n"
+ "* INFO/MQSBZ .. Mann-Whitney U-z test of Mapping Quality vs Strand Bias (Number=1,Type=Float)\n"
+ " INFO/NM .. Approximate average number of mismatches in ref and alt reads, experimental (Number=2,Type=Float)\n"
+ " INFO/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
+ "* INFO/RPBZ .. Mann-Whitney U-z test of Read Position Bias (Number=1,Type=Float)\n"
+ "* INFO/SCBZ .. Mann-Whitney U-z test of Soft-Clip Length Bias (Number=1,Type=Float)\n"
+ " INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
+ "* INFO/SGB .. Segregation based metric, http://samtools.github.io/bcftools/rd-SegBias.pdf (Number=1,Type=Float)\n"
+ "* INFO/VDB .. Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (Number=1,Type=Float)\n"
+ "\n");
}
static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -o, --output FILE Write output to FILE [standard output]\n"
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
" 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
- " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n"
" --threads INT Use multithreading with INT worker threads [0]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
" --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
fprintf(fp,
" --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
+ fprintf(fp,
+ " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n");
fprintf(fp,"\n");
fprintf(fp,
"Configuration profiles activated with -X, --config:\n"
mplp.n_threads = 0;
mplp.bsmpl = bam_smpl_init();
// the default to be changed in future, see also parse_format_flag()
- mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
+ mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB;
mplp.max_read_len = 500;
mplp.ambig_reads = B2B_DROP;
mplp.indel_win_size = 110;
{"gap-frac", required_argument, NULL, 'F'},
{"indel-bias", required_argument, NULL, 10},
{"indel-size", required_argument, NULL, 15},
+ {"indels-2.0", no_argument, NULL, 20},
{"tandem-qual", required_argument, NULL, 'h'},
{"skip-indels", no_argument, NULL, 'I'},
{"max-idepth", required_argument, NULL, 'L'},
{"platforms", required_argument, NULL, 'P'},
{"max-read-len", required_argument, NULL, 'M'},
{"config", required_argument, NULL, 'X'},
- {"mwu-u", no_argument, NULL, 'U'},
{"seed", required_argument, NULL, 13},
{"ambig-reads", required_argument, NULL, 14},
{"ar", required_argument, NULL, 14},
}
}
break;
+ case 20: mplp.indels_v20 = 1; break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
list_annotations(stderr);
return 1;
}
- mplp.fmt_flag |= parse_format_flag(optarg);
+ parse_format_flag(&mplp.fmt_flag,optarg);
break;
case 'M': mplp.max_read_len = atoi(optarg); break;
- case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
case 'X':
if (strcasecmp(optarg, "pacbio-ccs") == 0) {
mplp.min_frac = 0.1;
// Data shared by all bam files
typedef struct {
int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
- max_indel_depth, max_read_len, fmt_flag, ambig_reads;
+ max_indel_depth, max_read_len, ambig_reads;
+ uint32_t fmt_flag;
int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
double min_frac; // for indels
bcf1_t *bcf_rec;
htsFile *bcf_fp;
bcf_hdr_t *bcf_hdr;
+ int indels_v20;
int argc;
char **argv;
} mplp_conf_t;
// We cache sample information here so we don't have to keep recomputing this
// on each and every pileup column. If FMT/SCR annotation is requested, a flag
// is set to indicate the presence of a soft clip.
-//
-// Cd is an arbitrary block of data we can write into, which ends up in
-// the pileup structures. We stash the sample ID there:
-// has_soft_clip .. cd->i & 1
-// sample_id .. cd->i >> 1
static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
{
+ cd->p = calloc(1,sizeof(plp_cd_t));
+
+ PLP_NM(cd) = PLP_NM_UNSET;
+
mplp_aux_t *ma = (mplp_aux_t *)data;
int n = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
- cd->i = 0;
- PLP_SET_SAMPLE_ID(cd->i, n);
+ PLP_SET_SAMPLE_ID(cd, n);
+
// Whether read has a soft-clip is used in mplp_realn's heuristics.
// TODO: consider whether clip length is beneficial to use?
int i;
for (i=0; i<b->core.n_cigar; i++) {
int cig = bam_get_cigar(b)[i] & BAM_CIGAR_MASK;
if (cig == BAM_CSOFT_CLIP) {
- PLP_SET_SOFT_CLIP(cd->i);
+ PLP_SET_SOFT_CLIP(cd);
break;
}
}
// Possible further optimsation, check tot_ins==1 later
// (and remove break) so we can detect single bp indels.
// We may want to focus BAQ on more complex regions only.
- PLP_SET_INDEL(cd->i);
+ PLP_SET_INDEL(cd);
break;
}
return 0;
}
+// Pileup destructor callback, registered with bam_mplp_destructor().
+// Frees the per-read block that pileup_constructor() stored in cd->p with
+// calloc(). `data` and `b` are unused here. Always returns 0.
+static int pileup_destructor(void *data, const bam1_t *b, bam_pileup_cd *cd)
+{
+ free(cd->p);
+ return 0;
+}
static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
{
for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position
{
const bam_pileup1_t *p = plp[i] + j;
- int id = PLP_SAMPLE_ID(p->cd.i);
+ int id = PLP_SAMPLE_ID(&(p->cd));
if (m->n_plp[id] == m->m_plp[id])
{
m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
nt += n_plp[i];
for (j = 0; j < n_plp[i]; j++) { // iterate over reads
bam_pileup1_t *p = (bam_pileup1_t *)plp[i] + j;
- has_indel += (PLP_HAS_INDEL(p->cd.i) || p->indel) ? 1 : 0;
+ has_indel += (PLP_HAS_INDEL(&p->cd) || p->indel) ? 1 : 0;
// Has_clip is almost always true for very long reads
// (eg PacBio CCS), but these rarely matter as the clip
// is likely a long way from this indel.
- has_clip += (PLP_HAS_SOFT_CLIP(p->cd.i)) ? 1 : 0;
+ has_clip += (PLP_HAS_SOFT_CLIP(&p->cd)) ? 1 : 0;
if (max_indel < p->indel)
max_indel = p->indel;
if (min_indel > p->indel)
// We could use our own structure (p->cd.p), allocated during
// the constructor, but for simplicity we play dirty and
// abuse an unused flag bit instead.
- if (b->core.flag & 32768)
- continue;
- b->core.flag |= 32768;
+ if ( PLP_IS_REALN(&(p->cd)) ) continue;
+ PLP_SET_REALN(&(p->cd));
if (b->core.l_qseq > max_read_len)
continue;
}
int has_ref = mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
if (has_ref && (conf->flag & MPLP_REALN))
- mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag,
- conf->max_read_len, ref, ref_len, pos);
+ mplp_realn(conf->nfiles, conf->n_plp, conf->plp, conf->flag, conf->max_read_len, ref, ref_len, pos);
int total_depth, _ref0, ref16;
for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
conf->bc.tid = tid; conf->bc.pos = pos;
bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
bcf_clear1(conf->bcf_rec);
- bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag,
- conf->bca, 0);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, 0);
flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
// call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
// check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
- if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
- && (bcf_callaux_clean(conf->bca, &conf->bc),
- bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0))
+ if ( !(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth )
{
- for (i = 0; i < conf->gplp->n; ++i)
- bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
- if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ conf->bca->chr = tid>=0 ? hdr->target_name[tid] : NULL;
+ int iret;
+ if ( conf->indels_v20 )
+ iret = bcf_iaux_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
+ else
+ iret = bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref);
+ if ( iret>=0 )
{
- bcf_clear1(conf->bcf_rec);
- bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
- flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ {
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ }
}
}
}
bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">");
+ if ( conf->fmt_flag&B2B_INFO_IDV )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">");
+ if ( conf->fmt_flag&B2B_INFO_IMF )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
if ( conf->fmt_flag&B2B_INFO_VDB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
- if (conf->fmt_flag & B2B_INFO_ZSCORE) {
- if ( conf->fmt_flag&B2B_INFO_RPB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_RPBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Read Position Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_MQBZ )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_BQBZ )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_MQSBZ )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
- if ( conf->fmt_flag&B2B_FMT_NMBZ )
- bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
- if ( conf->fmt_flag&B2B_INFO_SCB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
- } else {
- if ( conf->fmt_flag&B2B_INFO_RPB )
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
- }
-
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Phred-scaled p-value using Fisher's exact test to detect strand bias\">");
-#if CDF_MWU_TESTS
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
-#endif
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
- bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_MIN_PL_SUM )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MIN_PL_SUM,Number=1,Type=Integer,Description=\"Sum of min PLs across all samples before normalization (experimental)\">");
+ if ( conf->fmt_flag&B2B_INFO_NM )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NM,Number=2,Type=Float,Description=\"Average number of mismatches in ref and alt reads (approximate, experimental, make me localized?)\">");
+ if ( conf->fmt_flag&B2B_INFO_NMBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better; approximate, experimental, make me localized?)\">");
+ if ( conf->fmt_flag&B2B_FMT_NMBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_SCBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_INFO_FS )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=FS,Number=1,Type=Float,Description=\"Fisher's exact test P-value to detect strand bias\">");
+ if ( conf->fmt_flag&B2B_INFO_SGB )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric, http://samtools.github.io/bcftools/rd-SegBias.pdf\">");
+ if ( conf->fmt_flag&B2B_INFO_MQ0F )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
conf->bca->fmt_flag = conf->fmt_flag;
conf->bca->ambig_reads = conf->ambig_reads;
conf->bca->indel_win_size = conf->indel_win_size;
+ conf->bca->indels_v20 = conf->indels_v20;
conf->bc.bcf_hdr = conf->bcf_hdr;
conf->bc.n = nsmpl;
conf->max_indel_depth = conf->max_indel_depth * nsmpl;
conf->bcf_rec = bcf_init1();
bam_mplp_constructor(conf->iter, pileup_constructor);
+ bam_mplp_destructor(conf->iter, pileup_destructor);
+
// Run mpileup for multiple regions
if ( nregs )
}
#undef MAX_PATH_LEN
-int parse_format_flag(const char *str)
+// Helper macros for parse_format_flag(). They expand inside its per-tag loop and rely on the
+// locals `tag`, `exclude`, `flag`, `tags` and `i`. On a case-insensitive match the optional
+// message is printed, the bit is set (or cleared when the tag was given with a leading '-'),
+// the tag string is freed and the loop continues with the next tag.
+// SET_FMT_FLAG accepts "str", "FMT/str" and "FORMAT/str"; SET_INFO_FLAG only "INFO/str".
+#define SET_FMT_FLAG(str,bit,msg) \
+    if (!strcasecmp(tag,str) || !strcasecmp(tag,"FMT/"str) || !strcasecmp(tag,"FORMAT/"str)) \
+    { \
+        if ( *msg ) fprintf(bcftools_stderr,"%s",msg); \
+        if ( exclude ) \
+            *flag &= ~(bit); \
+        else \
+            *flag |= (bit); \
+        free(tags[i]); \
+        continue; \
+    }
+#define SET_INFO_FLAG(str,bit,msg) \
+    if (!strcasecmp(tag,"INFO/"str)) \
+    { \
+        if ( *msg ) fprintf(bcftools_stderr,"%s",msg); /* was silently dropped, e.g. the INFO/DPR deprecation warning */ \
+        if ( exclude ) \
+            *flag &= ~(bit); \
+        else \
+            *flag |= (bit); \
+        free(tags[i]); \
+        continue; \
+    }
+
+// Update the annotation bit mask *flag according to the tags listed in str (split into
+// individual tags by hts_readlist). A tag prefixed with '-' clears the corresponding bit,
+// otherwise the bit is set. An unrecognised tag prints an error and terminates via
+// bcftools_exit(EXIT_FAILURE).
+void parse_format_flag(uint32_t *flag, const char *str)
{
- int i, flag = 0, n_tags;
+ int i, n_tags;
char **tags = hts_readlist(str, 0, &n_tags);
for(i=0; i<n_tags; i++)
{
- if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
- else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(bcftools_stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
- else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
- else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(bcftools_stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
- else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(bcftools_stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
- else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(bcftools_stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
- else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
- else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
- else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
- else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
- else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
- else if ( !strcasecmp(tags[i],"NMBZ") || !strcasecmp(tags[i],"FORMAT/NMBZ") || !strcasecmp(tags[i],"FMT/NMBZ") ) flag |= B2B_FMT_NMBZ;
- else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
- else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
- else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
- else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
- else if ( !strcasecmp(tags[i],"SCB") || !strcasecmp(tags[i],"INFO/SCB")) flag |= B2B_INFO_SCB;
- else
- {
- fprintf(bcftools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
- bcftools_exit(EXIT_FAILURE);
- }
- free(tags[i]);
+ // a leading '-' requests that the annotation be switched off rather than on
+ int exclude = tags[i][0]=='-' ? 1 : 0;
+ char *tag = exclude ? tags[i]+1 : tags[i];
+ // each macro below frees tags[i] and continues with the next tag when it matches
+ SET_FMT_FLAG("AD", B2B_FMT_AD, "");
+ SET_FMT_FLAG("ADF", B2B_FMT_ADF, "");
+ SET_FMT_FLAG("ADR", B2B_FMT_ADR, "");
+ SET_FMT_FLAG("DP", B2B_FMT_DP, "");
+ SET_FMT_FLAG("DP4", B2B_FMT_DP4, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n");
+ SET_FMT_FLAG("DPR", B2B_FMT_DPR, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n");
+ SET_FMT_FLAG("DV", B2B_FMT_DV, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n");
+ SET_FMT_FLAG("NMBZ", B2B_FMT_NMBZ, "");
+ SET_FMT_FLAG("QS", B2B_FMT_QS, "");
+ SET_FMT_FLAG("SP", B2B_FMT_SP, "");
+ SET_FMT_FLAG("SCR", B2B_FMT_SCR, "");
+ SET_INFO_FLAG("DPR", B2B_INFO_DPR, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n");
+ SET_INFO_FLAG("AD", B2B_INFO_AD, "");
+ SET_INFO_FLAG("ADF", B2B_INFO_ADF, "");
+ SET_INFO_FLAG("ADR", B2B_INFO_ADR, "");
+ SET_INFO_FLAG("BQBZ", B2B_INFO_BQBZ, "");
+ SET_INFO_FLAG("FS", B2B_INFO_FS, "");
+ SET_INFO_FLAG("IDV", B2B_INFO_IDV, "");
+ SET_INFO_FLAG("IMF", B2B_INFO_IMF, "");
+ SET_INFO_FLAG("MIN_PL_SUM", B2B_INFO_MIN_PL_SUM, "");
+ SET_INFO_FLAG("MQ0F", B2B_INFO_MQ0F, "");
+ SET_INFO_FLAG("MQBZ", B2B_INFO_MQBZ, "");
+ // MQSBZ is enabled by default and advertised by list_annotations(), so it must be parseable
+ SET_INFO_FLAG("MQSBZ", B2B_INFO_MQSBZ, "");
+ SET_INFO_FLAG("NM", B2B_INFO_NM, "");
+ SET_INFO_FLAG("NMBZ", B2B_INFO_NMBZ, "");
+ SET_INFO_FLAG("RPBZ", B2B_INFO_RPBZ, "");
+ SET_INFO_FLAG("SCBZ", B2B_INFO_SCBZ, "");
+ SET_INFO_FLAG("SCR", B2B_INFO_SCR, "");
+ SET_INFO_FLAG("SGB", B2B_INFO_SGB, "");
+ SET_INFO_FLAG("VDB", B2B_INFO_VDB, "");
+ // no macro matched: unknown tag
+ fprintf(bcftools_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tag, str);
+ bcftools_exit(EXIT_FAILURE);
}
if (n_tags) free(tags);
- return flag;
}
// todo: make it possible to turn off some annotations or change the defaults,
static void list_annotations(FILE *fp)
{
+ // Writes a fixed help text to fp with a single fprintf call; entries prefixed with "*"
+ // are described by the text itself as the annotations added by default.
fprintf(fp,
-"\n"
-"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
-"\n"
-" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
-" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
-" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
-" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
-" FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
-" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
-" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
-" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
-"\n"
-"INFO annotation tags available:\n"
-"\n"
-" INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
-" INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
-" INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
-" INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
-"\n");
+ "Annotations added by default are in this list prefixed with \"*\". To suppress their output, run with\n"
+ "e.g. \"-a -FORMAT/AD\".\n"
+ "\n"
+ "FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
+ "\n"
+ " FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+ " FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+ " FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+ " FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+ " FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
+ " FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
+ " FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+ " FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
+ "\n"
+ "INFO annotation tags available:\n"
+ "\n"
+ " INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
+ " INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
+ " INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+ "* INFO/BQBZ .. Mann-Whitney U test of Base Quality Bias (Number=1,Type=Float)\n"
+ " INFO/FS .. Fisher's exact test P-value to detect strand bias (Number=1,Type=Float)\n"
+ "* INFO/IDV .. Maximum number of raw reads supporting an indel (Number=1,Type=Integer)\n"
+ "* INFO/IMF .. Maximum fraction of raw reads supporting an indel (Number=1,Type=Float)\n"
+ " INFO/MIN_PL_SUM\n"
+ " .. Sum of min PL across all samples before normalization, experimental (Number=1,Type=Integer)\n"
+ "* INFO/MQ0F .. Fraction of reads with zero mapping quality (Number=1,Type=Float)\n"
+ "* INFO/MQBZ .. Mann-Whitney U test of Mapping Quality Bias (Number=1,Type=Float)\n"
+ "* INFO/MQSBZ .. Mann-Whitney U-z test of Mapping Quality vs Strand Bias (Number=1,Type=Float)\n"
+ " INFO/NM .. Approximate average number of mismatches in ref and alt reads, experimental (Number=2,Type=Float)\n"
+ " INFO/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
+ "* INFO/RPBZ .. Mann-Whitney U test of Read Position Bias (Number=1,Type=Float)\n"
+ "* INFO/SCBZ .. Mann-Whitney U-z test of Soft-Clip Length Bias (Number=1,Type=Float)\n"
+ " INFO/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
+ "* INFO/SGB .. Segregation based metric, http://samtools.github.io/bcftools/rd-SegBias.pdf (Number=1,Type=Float)\n"
+ "* INFO/VDB .. Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (Number=1,Type=Float)\n"
+ "\n");
}
static void print_usage(FILE *fp, const mplp_conf_t *mplp)
" -o, --output FILE Write output to FILE [standard output]\n"
" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
" 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n"
- " -U, --mwu-u Use older probability scale for Mann-Whitney U test\n"
" --threads INT Use multithreading with INT worker threads [0]\n"
"\n"
"SNP/INDEL genotype likelihoods options:\n"
" --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
fprintf(fp,
" --indel-size INT Approximate maximum indel size considered [%d]\n", mplp->indel_win_size);
+ fprintf(fp,
+ " --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n");
fprintf(fp,"\n");
fprintf(fp,
"Configuration profiles activated with -X, --config:\n"
mplp.n_threads = 0;
mplp.bsmpl = bam_smpl_init();
// the default to be changed in future, see also parse_format_flag()
- mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE;
+ mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB;
mplp.max_read_len = 500;
mplp.ambig_reads = B2B_DROP;
mplp.indel_win_size = 110;
{"gap-frac", required_argument, NULL, 'F'},
{"indel-bias", required_argument, NULL, 10},
{"indel-size", required_argument, NULL, 15},
+ {"indels-2.0", no_argument, NULL, 20},
{"tandem-qual", required_argument, NULL, 'h'},
{"skip-indels", no_argument, NULL, 'I'},
{"max-idepth", required_argument, NULL, 'L'},
{"platforms", required_argument, NULL, 'P'},
{"max-read-len", required_argument, NULL, 'M'},
{"config", required_argument, NULL, 'X'},
- {"mwu-u", no_argument, NULL, 'U'},
{"seed", required_argument, NULL, 13},
{"ambig-reads", required_argument, NULL, 14},
{"ar", required_argument, NULL, 14},
}
}
break;
+ case 20: mplp.indels_v20 = 1; break;
case 'A': use_orphan = 1; break;
case 'F': mplp.min_frac = atof(optarg); break;
case 'm': mplp.min_support = atoi(optarg); break;
list_annotations(bcftools_stderr);
return 1;
}
- mplp.fmt_flag |= parse_format_flag(optarg);
+ parse_format_flag(&mplp.fmt_flag,optarg);
break;
case 'M': mplp.max_read_len = atoi(optarg); break;
- case 'U': mplp.fmt_flag &= ~B2B_INFO_ZSCORE; break;
case 'X':
if (strcasecmp(optarg, "pacbio-ccs") == 0) {
mplp.min_frac = 0.1;
--- /dev/null
+/* read_consensus.c -- create and maintain consensus of reads
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: pd3@sanger
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE. */
+
+#include <assert.h>
+#include <math.h>
+#include "bcftools.h"
+#include "read_consensus.h"
+#include "cigar_state.h"
+#include "kheap.h"
+
+
+// Frequency arrays for each variant type
+#define NI 10 // number of alternative insertion sequences at one position in a single sample
+// Insertions observed at one window position. Slots are filled from index 0 upwards; the
+// first slot with a NULL nt16_seq ends the used entries.
+typedef struct
+{
+ char *nt16_seq[NI]; // malloc'ed inserted sequence, one nt16 base code (as returned by bam_seqi) per byte
+ int len[NI]; // number of inserted bases
+ int freq[NI]; // number of times this insertion was recorded
+}
+ins_freq_t;
+
+// Deletions observed at one window position. Slots are filled from index 0 upwards; the
+// first slot with len==0 ends the used entries.
+typedef struct
+{
+ int len[NI]; // deletion length
+ int freq[NI]; // number of times a deletion of this length was recorded
+}
+del_freq_t;
+
+#define BF_DEL 5 // index of the deletion counter in base_freq_t.base
+// Per-position counts of observed bases; a position spanned by a deletion counts in base[BF_DEL]
+typedef struct
+{
+ int base[6]; // frequencies of A,C,G,T,N,deletion
+}
+base_freq_t;
+
+
+// Candidate variants for each interesting position to build consensus haplotypes
+// (`done` is not a variant: it is the sentinel passed to register_variant() to finalize the list)
+enum variant_type { snv, ins, del, done };
+typedef struct
+{
+ enum variant_type vtype;
+ hts_pos_t pos; // variant position (reference sequence coordinates), indels follow VCF convention
+ int idx; // temporary 0-based index to rcns.cvar
+ int which, // base/ins/del in rcns.[base|ins|del]_freq array
+ depth; // coverage at the position
+ float af, af_dev; // variant allele frequency (just for debugging printout) and absolute af deviation from 0.5
+}
+candidate_var_t;
+// Heap ordering predicate: returns 1 when `a` ranks below `b` as a candidate variant, i.e.
+// its allele frequency deviates more from 0.5 or, at equal deviation, its depth is lower.
+static inline int cvar_not_preferred(candidate_var_t *a, candidate_var_t *b)
+{
+    if ( a->af_dev != b->af_dev ) return a->af_dev > b->af_dev;
+    return a->depth < b->depth;
+}
+KHEAP_INIT(cvh, candidate_var_t, cvar_not_preferred);
+typedef khp_cvh_t cvar_heap_t;
+
+#define MAX_NCVAR 8 // This results in alloc() of 2^MAX_NCVAR possible haplotypes
+#define NHAP (1<<MAX_NCVAR) // The number of possible haplotypes
+// Working state for building read consensus sequences within one window around a position
+struct _read_cns_t
+{
+ hts_pos_t pos, beg, end; // current position and window boundaries (0-based, inclusive, ref seq coordinates)
+ int band, // maximum absolute deviation from the diagonal, used for BAQ alignment
+ max_del; // maximum deletion length starting at the tested position
+ base_freq_t *base_freq; // frequency of each variant type: base, ins, del
+ ins_freq_t *ins_freq;
+ del_freq_t *del_freq;
+ char *stmp; // temporary array
+ int mstmp, mfreq; // allocated size of stmp and *_freq arrays
+ cvar_heap_t *cv_heap; // heap to maintain the top MAX_NCVAR variants
+ int ncvar; // cvar and cv_heap size
+ candidate_var_t cvar[MAX_NCVAR]; // candidate variants, sorted by position and type
+ int hap_freq[NHAP]; // haplotype frequencies
+ bam_pileup1_t *plp; // reads to construct consensus from
+ int nplp; // number of reads in the pileup
+ int cns_hap[2], ncns; // the top two consensus haplotypes and the number of haplotypes to use
+ int mcns; // the allocated size of cns.seq and cns.pos buffers
+ cns_seq_t cns[3]; // the consensus sequences to fill
+};
+
+// Release a read_cns_t together with every buffer it owns. A NULL pointer is ignored.
+void rcns_destroy(read_cns_t *rcns)
+{
+    if ( rcns==NULL ) return;
+
+    // Inserted sequences occupy consecutive slots starting at 0; a NULL slot ends the list
+    int ipos;
+    for (ipos=0; ipos<rcns->mfreq; ipos++)
+    {
+        char **seqs = rcns->ins_freq[ipos].nt16_seq;
+        int k = 0;
+        while ( k<NI && seqs[k] )
+        {
+            free(seqs[k]);
+            k++;
+        }
+    }
+
+    // Only the first two consensus buffers are ever released here
+    free(rcns->cns[0].seq);
+    free(rcns->cns[1].seq);
+
+    free(rcns->ins_freq);
+    free(rcns->del_freq);
+    free(rcns->base_freq);
+    free(rcns->stmp);
+    khp_destroy(cvh,rcns->cv_heap);
+    free(rcns);
+}
+// Size the per-position frequency arrays for the window [beg,end] and clear them.
+// The arrays only ever grow; mfreq records the number of allocated positions. Returns 0 on
+// success and -1 when a realloc fails (the previously allocated arrays stay valid).
+static int init_arrays(read_cns_t *rcns)
+{
+ int i,j,n = rcns->end - rcns->beg + 1;
+ if ( n > rcns->mfreq )
+ {
+ // grow each array via a temporary pointer and zero only the newly added tail
+ ins_freq_t *ifrq = (ins_freq_t*) realloc(rcns->ins_freq,sizeof(*rcns->ins_freq)*n);
+ if ( !ifrq ) return -1;
+ rcns->ins_freq = ifrq;
+ memset(ifrq+rcns->mfreq,0,sizeof(*rcns->ins_freq)*(n-rcns->mfreq));
+
+ del_freq_t *dfrq = (del_freq_t*) realloc(rcns->del_freq,sizeof(*rcns->del_freq)*n);
+ if ( !dfrq ) return -1;
+ rcns->del_freq = dfrq;
+ memset(dfrq+rcns->mfreq,0,sizeof(*rcns->del_freq)*(n-rcns->mfreq));
+
+ base_freq_t *bfrq = (base_freq_t*) realloc(rcns->base_freq,sizeof(*rcns->base_freq)*n);
+ if ( !bfrq ) return -1;
+ rcns->base_freq = bfrq;
+ memset(bfrq+rcns->mfreq,0,sizeof(*rcns->base_freq)*(n-rcns->mfreq));
+
+ rcns->mfreq = n;
+ }
+ memset(rcns->base_freq,0,sizeof(*rcns->base_freq)*n);
+ memset(rcns->del_freq,0,sizeof(*rcns->del_freq)*n);
+ // free insertion sequences left in the first n positions before wiping the pointers
+ for (i=0; i<n; i++)
+ {
+ ins_freq_t *ifrq = &rcns->ins_freq[i];
+ for (j=0; j<NI && ifrq->nt16_seq[j]; j++) free(ifrq->nt16_seq[j]);
+ }
+ memset(rcns->ins_freq,0,sizeof(*rcns->ins_freq)*n);
+ return 0;
+}
+// Point an existing read_cns_t at a new position and window and clear its per-window
+// state. Returns the result of init_arrays(): 0 on success, -1 on allocation failure.
+int rcns_reset(read_cns_t *rcns, hts_pos_t pos, hts_pos_t beg, hts_pos_t end)
+{
+    int ith;
+
+    rcns->band = 0;
+    rcns->pos  = pos;
+    rcns->beg  = beg;
+    rcns->end  = end;
+
+    for (ith=0; ith<2; ith++)
+    {
+        cns_seq_t *cns = &rcns->cns[ith];
+        cns->ipos = 0;
+        cns->nseq = 0;
+    }
+
+    // Leftover heap entries are possible only if the caller did not run all steps
+    cvar_heap_t *heap = rcns->cv_heap;
+    while ( heap->ndat ) khp_delete(cvh, heap);
+
+    return init_arrays(rcns);
+}
+
+// Count one observation of the base with code nt16 at reference position ref_pos
+static inline void add_base(read_cns_t *rcns, int ref_pos, int nt16)
+{
+    const int iwin = ref_pos - rcns->beg;           // offset of the position within the window
+    base_freq_t *counts = &rcns->base_freq[iwin];
+    counts->base[seq_nt16_int[nt16]] += 1;
+}
+// Record an insertion of `len` bases, read from raw_seq starting at offset seq_pos, at the
+// window position corresponding to ref_pos. Identical insertions (same length and bases)
+// share one slot and increment its frequency. The insertion is silently dropped when all NI
+// slots are taken by other sequences or when memory cannot be allocated.
+static void add_ins(read_cns_t *rcns, int ref_pos, int seq_pos, uint8_t *raw_seq, int len)
+{
+ int i = ref_pos - rcns->beg;
+ ins_freq_t *ifrq = &rcns->ins_freq[i];
+ char *str;
+ // make sure the scratch buffer can hold the unpacked insertion
+ if ( rcns->mstmp < len )
+ {
+ str = realloc(rcns->stmp,len*sizeof(*str));
+ if ( !str ) return;
+ rcns->mstmp = len;
+ rcns->stmp = str;
+ }
+ else
+ str = rcns->stmp;
+ // unpack the inserted bases, one nt16 code per byte
+ for (i=0; i<len; i++) str[i] = bam_seqi(raw_seq,i+seq_pos);
+
+ // look for an already registered identical insertion; stops at the first unused slot
+ for (i=0; i<NI && ifrq->nt16_seq[i]; i++)
+ if ( ifrq->len[i]==len && !memcmp(ifrq->nt16_seq[i],str,len) ) break;
+
+ if ( i>=NI ) return; // too many choices, typically homopolymers in long reads; discard
+
+ if ( !ifrq->nt16_seq[i] ) // new insertion
+ {
+ if ( !(ifrq->nt16_seq[i]=malloc(len)) ) return;
+ memcpy(ifrq->nt16_seq[i], str, len);
+ ifrq->len[i] = len;
+ }
+ ifrq->freq[i]++;
+}
+// Record a deletion of `len` bases at the window position corresponding to ref_pos.
+// The deletion counter (BF_DEL) is incremented for the following positions, clipped to the
+// window, and the length is registered in the position's del_freq slots. The length is not
+// registered when all NI slots are taken by other lengths.
+static void add_del(read_cns_t *rcns, int ref_pos, int len)
+{
+ int i = ref_pos - rcns->beg;
+ int j,n = rcns->end - rcns->beg + 1;
+ // positions i+1 .. i+len are the deleted bases; do not run past the window end
+ if ( i + len + 1 < n ) n = i + len + 1;
+ for (j=i+1; j<n; j++)
+ rcns->base_freq[j].base[BF_DEL]++;
+
+ // find the slot with the same length or the first unused slot (len==0)
+ del_freq_t *dfrq = &rcns->del_freq[i];
+ for (i=0; i<NI && dfrq->len[i]; i++)
+ if ( dfrq->len[i]==len ) break;
+
+ if ( i>=NI ) return; // too many choices, typically homopolymers in long reads; discard
+
+ if ( !dfrq->len[i] ) dfrq->len[i] = len; // new deletion
+ dfrq->freq[i]++;
+}
+
+// Allocate a read consensus structure for position `pos` within the window [beg,end]
+// (inclusive). Returns NULL when any allocation fails; a non-NULL result must be released
+// with rcns_destroy().
+read_cns_t *rcns_init(hts_pos_t pos, hts_pos_t beg, hts_pos_t end)
+{
+    read_cns_t *rcns = (read_cns_t*) calloc(1,sizeof(read_cns_t));
+    if ( !rcns ) return NULL;           // previously dereferenced without a check
+    rcns->pos = pos;
+    rcns->beg = beg;
+    rcns->end = end;
+    rcns->cv_heap = khp_init(cvh);
+    if ( !rcns->cv_heap )
+    {
+        free(rcns);                     // nothing else has been allocated at this point
+        return NULL;
+    }
+    if ( init_arrays(rcns)!=0 )
+    {
+        rcns_destroy(rcns);
+        return NULL;
+    }
+    return rcns;
+}
+
+// Register the pileup reads and accumulate their bases, insertions and deletions that fall
+// into the current window into the frequency arrays. Also raises rcns->band to the largest
+// value reached by the per-read indel accumulator. Always returns 0; an unexpected CIGAR
+// operator is reported through error().
+int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp)
+{
+ // save the reads for phasing, this can be called multiple times
+ rcns->plp = plp;
+ rcns->nplp = nplp;
+
+ // fill consensus arrays
+ int i,j,k, local_band_max = 0; // maximum absolute deviation from diagonal
+ for (i=0; i<nplp; i++) // for each read...
+ {
+ const bam_pileup1_t *p = plp + i;
+ bam1_t *b = p->b;
+ // NOTE(review): if b->core.pos is a 64-bit hts_pos_t this narrows it to int - confirm
+ int x = b->core.pos; // ref coordinate
+ int y = 0; // seq coordinate
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+ int local_band = 0; // current deviation from diagonal
+ for (k = 0; k < b->core.n_cigar; ++k)
+ {
+ int op = cigar[k] & BAM_CIGAR_MASK;
+ int len = cigar[k] >> BAM_CIGAR_SHIFT;
+ if ( op==BAM_CSOFT_CLIP ) y += len;
+ else if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF )
+ {
+ if ( x<rcns->end && x+len>rcns->beg )
+ {
+ // the aligned block overlaps the window: count only the bases inside it
+ int j_beg = rcns->beg > x ? rcns->beg - x : 0; // how many bases to skip in the ref and qry
+ int j_end = rcns->end < x + len - 1 ? rcns->end - x : len - 1;
+ x += j_beg;
+ y += j_beg;
+ for (j=j_beg; j<=j_end; j++, x++, y++) add_base(rcns,x,bam_seqi(seq,y));
+ }
+ else
+ {
+ x += len;
+ y += len;
+ }
+ }
+ else if ( op==BAM_CINS )
+ {
+ if ( x>rcns->beg && x<rcns->end )
+ {
+ local_band += p->indel;
+ add_ins(rcns,x-1,y,seq,len); // x-1: one base before as in VCF
+ }
+ y += len;
+ }
+ else if ( op==BAM_CDEL )
+ {
+ // only deletions lying entirely within the window are recorded
+ if ( x>rcns->beg && x+len-1<=rcns->end )
+ {
+ local_band += -p->indel;
+ add_del(rcns,x-1,len); // x-1: one base before as in VCF
+ }
+ x += len;
+ }
+ else if ( op==BAM_CHARD_CLIP ) continue;
+ else error("rcns_set_reads todo: unknown cigar operator %d\n",op);
+ if ( local_band_max < local_band ) local_band_max = local_band;
+ }
+
+ // Track the biggest deviation +/- from diagonal, used in BAQ alignment step.
+ if ( rcns->band < local_band_max ) rcns->band = local_band_max;
+ }
+
+ return 0;
+}
+
+#if DEBUG_RCNS
+// Debugging aid: print to stderr one line per window position with the reference base, the
+// six base counters (the one matching the reference base is marked with '*'), the registered
+// deletions as -len:freq and the registered insertions as +sequence:freq.
+static void debug_print_base_freqs(read_cns_t *rcns, const char *ref)
+{
+ int i,j,k,n = rcns->end - rcns->beg + 1;
+ fprintf(stderr,"beg,end,pos=%d %d %d\n",(int)rcns->beg,(int)rcns->end,(int)rcns->pos);
+ base_freq_t *bfreq = rcns->base_freq;
+ ins_freq_t *ifreq = rcns->ins_freq;
+ del_freq_t *dfreq = rcns->del_freq;
+ for (i=0; i<n && ref[i]; i++)
+ {
+ // positions are printed 1-based
+ fprintf(stderr,"%"PRIhts_pos" %c\t",rcns->beg+i+1,ref[i]);
+ for (j=0; j<6; j++)
+ fprintf(stderr,"\t%d%s",bfreq[i].base[j],ref[i]=="ACGTNi"[j]?"*":"");
+ fprintf(stderr,"\t");
+ for (j=0; j<NI && dfreq[i].len[j]; j++)
+ fprintf(stderr," -%d:%d",dfreq[i].len[j],dfreq[i].freq[j]);
+ fprintf(stderr,"\t");
+ for (j=0; j<NI && ifreq[i].len[j]; j++)
+ {
+ fprintf(stderr," +");
+ for (k=0; k<ifreq[i].len[j]; k++) fprintf(stderr,"%c",seq_nt16_str[(int)ifreq[i].nt16_seq[j][k]]);
+ fprintf(stderr,":%d",ifreq[i].freq[j]);
+ }
+ fprintf(stderr,"\n");
+ }
+}
+// Debugging aid: short printable name of a variant type, "???" for anything else
+static const char *vtype2string(enum variant_type vtype)
+{
+    switch (vtype)
+    {
+        case snv: return "snv";
+        case ins: return "ins";
+        case del: return "del";
+        default:  return "???";
+    }
+}
+// Debugging aid: print to stderr all fields of the rcns->ncvar candidate variants
+static void debug_print_candidate_variants(read_cns_t *rcns)
+{
+ int i;
+ fprintf(stderr,"Candidate variants:\n");
+ for (i=0; i<rcns->ncvar; i++)
+ {
+ candidate_var_t *var = &rcns->cvar[i];
+ // the position is printed 1-based
+ fprintf(stderr,"\tvar%d pos=%"PRIhts_pos" idx=%d vtype=%s which=%d depth=%d af=%f af_dev=%f\n",
+ i,var->pos+1,var->idx,vtype2string(var->vtype),var->which,var->depth,var->af,var->af_dev);
+ }
+}
+// Debugging aid: print to stderr every haplotype with a non-zero count as its number, its
+// bit pattern (bit 0 printed first, one digit per candidate variant) and its count
+static void debug_print_haplotype_frequency_spectrum(read_cns_t *rcns)
+{
+ int i,j;
+ fprintf(stderr,"Haplotype frequencies (bits from left correspond to var0,1,..):\n");
+ for (i=0; i<NHAP; i++)
+ {
+ if ( !rcns->hap_freq[i] ) continue;
+ fprintf(stderr,"\t%d: ",i);
+ for (j=0; j<rcns->ncvar; j++)
+ fprintf(stderr,"%d", i&(1<<j) ? 1 : 0);
+ fprintf(stderr,"\t%d\n", rcns->hap_freq[i]);
+ }
+}
+// Debugging aid: print to stderr the reference window and up to two consensus sequences.
+// In each consensus a '#' is printed right after the base at index ipos.
+static void debug_print_consensus(read_cns_t *rcns, const char *ref)
+{
+ int i,j,n = rcns->end - rcns->beg + 1;
+ fprintf(stderr,"ref: ");
+ for (i=0; i<n && ref[i]; i++) fprintf(stderr,"%c",ref[i]);
+ fprintf(stderr,"\n");
+ for (i=0; i<2; i++)
+ {
+ // an empty consensus ends the listing
+ if ( !rcns->cns[i].nseq ) break;
+ fprintf(stderr,"Consensus%d: ",i);
+ for (j=0; j<=rcns->cns[i].ipos; j++)
+ fprintf(stderr,"%c","ACGTN"[(int)rcns->cns[i].seq[j]]);
+ fprintf(stderr,"#");
+ for (; j<rcns->cns[i].nseq; j++)
+ fprintf(stderr,"%c","ACGTN"[(int)rcns->cns[i].seq[j]]);
+ fprintf(stderr,"\n");
+ }
+}
+#else
+#define debug_print_base_freqs(rcns,ref)
+#define debug_print_candidate_variants(rcns)
+#define debug_print_haplotype_frequency_spectrum(rcns)
+#define debug_print_consensus(rcns,ref)
+#endif
+
+// qsort comparator for candidate_var_t: ascending by position, then variant type, then `which`
+static int cvar_pos_cmp(const void *aptr, const void *bptr)
+{
+    const candidate_var_t *lhs = (const candidate_var_t*)aptr;
+    const candidate_var_t *rhs = (const candidate_var_t*)bptr;
+    if ( lhs->pos != rhs->pos ) return lhs->pos < rhs->pos ? -1 : 1;
+    if ( lhs->vtype != rhs->vtype ) return lhs->vtype < rhs->vtype ? -1 : 1;
+    if ( lhs->which != rhs->which ) return lhs->which < rhs->which ? -1 : 1;
+    return 0;
+}
+// Add one candidate variant to the bounded set kept in rcns->cv_heap, or finalize the set.
+// vtype   .. snv/ins/del to add a variant; `done` moves the heap content into rcns->cvar,
+//            sorted by position, type and `which`, and leaves the heap empty
+// cns_pos .. 0-based offset of the variant within the window
+// which   .. index of the allele in the corresponding frequency array
+// depth, freq .. coverage and allele frequency at the position
+static void register_variant(read_cns_t *rcns, enum variant_type vtype, int cns_pos, int which, int depth, float freq)
+{
+ cvar_heap_t *cv_heap = rcns->cv_heap;
+ if ( vtype==done )
+ {
+ rcns->ncvar = 0;
+ while (cv_heap->ndat)
+ {
+ rcns->cvar[rcns->ncvar++] = cv_heap->dat[0];
+ khp_delete(cvh,cv_heap);
+ }
+ // sort the variants by pos,type,which to make determination of haplotypes from reads faster
+ if ( rcns->ncvar )
+ qsort(rcns->cvar, rcns->ncvar, sizeof(*rcns->cvar), cvar_pos_cmp);
+ return;
+ }
+
+ candidate_var_t var;
+ var.pos = cns_pos + rcns->beg;
+ var.which = which;
+ var.vtype = vtype;
+ var.depth = depth;
+ var.af_dev = fabs(0.5-freq);
+ var.af = freq;
+
+ int free_slot;
+
+ // keep the number of variants small, maximum MAX_NCVAR
+ if ( rcns->ncvar==MAX_NCVAR )
+ {
+ if ( cvar_not_preferred(&var,&cv_heap->dat[0]) ) return; // no need to add, the new variant is worse than the heap's worst one
+ // reuse the cvar slot of the evicted heap top
+ free_slot = cv_heap->dat[0].idx;
+ khp_delete(cvh,cv_heap);
+ }
+ else
+ free_slot = rcns->ncvar++;
+ var.idx = free_slot;
+ rcns->cvar[free_slot] = var;
+ khp_insert(cvh,cv_heap,&var);
+}
+
+// Identify candidate variant positions. (Note that homozygous variants are not considered
+// as those will be added trivially by taking the consensus base.) The detection limit is
+// for now hard-wired. This has only indirect effect on sensitivity, will just not contribute
+// to the consensus template when realigning.
+//
+// On return rcns->cvar/ncvar hold at most MAX_NCVAR candidates sorted by position, the
+// consensus buffers are large enough for the window plus all registered insertions, and
+// rcns->max_del is the longest deletion registered at the tested position.
+// Returns 0 on success, -1 when the consensus buffers cannot be enlarged.
+static int select_candidate_variants(read_cns_t *rcns, const char *ref)
+{
+    const float af_th = 0.1;
+    int i,j, n = rcns->end - rcns->beg + 1;
+    int max_ins_len = 0;    // maximum total length of all insertions applied to allocate big enough buffers
+    base_freq_t *bfreq = rcns->base_freq;
+    ins_freq_t *ifreq  = rcns->ins_freq;
+    del_freq_t *dfreq  = rcns->del_freq;
+
+    // register_variant() uses ncvar as the heap size. Start from an empty set so that a count
+    // left over from a previous window cannot shrink the number of candidates kept.
+    while (rcns->cv_heap->ndat) khp_delete(cvh, rcns->cv_heap);
+    rcns->ncvar = 0;
+
+    for (i=0; i<n && ref[i]; i++)
+    {
+        for (j=0; j<NI && ifreq[i].len[j]; j++) max_ins_len += ifreq[i].len[j];
+
+        if ( i==rcns->pos - rcns->beg ) continue;   // creating consensus from everything but the variants at the current position
+
+        // depth: A,C,G,T observations plus all registered deletions and insertions
+        int dp = 0;
+        for (j=0; j<4; j++) dp += bfreq[i].base[j];
+        for (j=0; j<NI && dfreq[i].len[j]; j++) dp += dfreq[i].freq[j];
+        for (j=0; j<NI && ifreq[i].len[j]; j++) dp += ifreq[i].freq[j];
+        float af = 0;   // allele frequency
+        for (j=0; j<4; j++)
+        {
+            if ( !bfreq[i].base[j] || ref[i]=="ACGTN"[j] ) continue;   // ref base or no coverage
+            af = (float)bfreq[i].base[j]/dp;
+            if ( af>af_th && af<(1-af_th) ) register_variant(rcns,snv,i,j,dp,af);
+        }
+        for (j=0; j<NI && dfreq[i].len[j]; j++)
+        {
+            af = (float)dfreq[i].freq[j]/dp;
+            if ( af>af_th && af<(1-af_th) ) register_variant(rcns,del,i,j,dp,af);
+        }
+        for (j=0; j<NI && ifreq[i].len[j]; j++)
+        {
+            af = (float)ifreq[i].freq[j]/dp;
+            if ( af>af_th && af<(1-af_th) ) register_variant(rcns,ins,i,j,dp,af);
+        }
+    }
+    register_variant(rcns,done,0,0,0,0);    // finalize
+
+    // Reallocate buffers
+    if ( rcns->mcns < n + max_ins_len )
+    {
+        n += max_ins_len;
+        for (i=0; i<2; i++)
+        {
+            char *seq = (char*) realloc(rcns->cns[i].seq,sizeof(char)*n);
+            if ( !seq ) return -1;
+            rcns->cns[i].seq = seq;
+        }
+        rcns->mcns = n;
+    }
+
+    // Find the longest deletion at the query position. Used slots end at the first zero
+    // length (the loop used to compare the slot index with the length and could stop early).
+    i = rcns->pos - rcns->beg;
+    rcns->max_del = 0;
+    for (j=0; j<NI && dfreq[i].len[j]; j++)
+    {
+        if ( rcns->max_del < dfreq[i].len[j] ) rcns->max_del = dfreq[i].len[j];
+    }
+
+    return 0;
+}
+// Count how many reads support each combination of candidate variants. For every read a
+// bit mask is built with bit j set when the read carries candidate variant j, and
+// hap_freq[mask] is incremented; a read carrying none of the variants counts towards
+// haplotype 0. Always returns 0.
+static int create_haplotype_frequency_spectrum(read_cns_t *rcns)
+{
+ memset(rcns->hap_freq,0,sizeof(rcns->hap_freq));
+
+ int i;
+ for (i=0; i<rcns->nplp; i++) // for each read...
+ {
+ const bam_pileup1_t *p = rcns->plp + i;
+ cigar_state_t cigar;
+ cstate_init(&cigar,p->b);
+
+ // NOTE(review): for cstate_seek_op_fwd() a return of -2 stops the scan of this read and -1
+ // skips the variant; presumably "read ends before the position" and "operation not found
+ // at the position" respectively - confirm against cigar_state.h
+ int j,k,hap = 0;
+ for (j=0; j<rcns->ncvar; j++)
+ {
+ candidate_var_t *cvar = &rcns->cvar[j];
+ if ( cvar->vtype==snv )
+ {
+ int iseq = cstate_seek_op_fwd(&cigar, cvar->pos, BAM_CMATCH, NULL);
+ if ( iseq==-2 ) break;
+ if ( iseq==-1 ) continue;
+ int nt16 = bam_seqi(cigar.seq, iseq);
+ if ( seq_nt16_int[nt16]==cvar->which ) hap |= 1<<j;
+ }
+ else if ( cvar->vtype==ins )
+ {
+ int len;
+ ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg];
+ int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len);
+ if ( iseq==-2 ) break;
+ if ( iseq==-1 ) continue;
+ // the read's insertion must match the candidate in length and in every base
+ if ( len!=ifrq->len[cvar->which] ) continue;
+ for (k=0; k<ifrq->len[cvar->which]; k++)
+ if ( bam_seqi(cigar.seq,iseq+k)!=ifrq->nt16_seq[cvar->which][k] ) break;
+ if ( k==ifrq->len[cvar->which] ) hap |= 1<<j;
+ }
+ else if ( cvar->vtype==del )
+ {
+ int len;
+ del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg];
+ int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len);
+ if ( ret==-2 ) break;
+ if ( ret==-1 ) continue;
+ if ( len!=dfrq->len[cvar->which] ) continue;
+ hap |= 1<<j;
+ }
+ }
+ rcns->hap_freq[hap]++;
+ }
+ return 0;
+}
+
+// A haplotype bit mask paired with the number of reads supporting it
+typedef struct
+{
+    int haplotype, count;
+}
+ii_t;
+
+// qsort comparator ordering ii_t elements by descending count
+static int ii_cmp(const void *a, const void *b)
+{
+    const ii_t *lhs = (const ii_t*)a;
+    const ii_t *rhs = (const ii_t*)b;
+    if ( lhs->count == rhs->count ) return 0;
+    return lhs->count > rhs->count ? -1 : 1;
+}
+
+// Select two most common haplotypes trying to account for 1bp errors. Haplotypes
+// are represented as 8-bit numbers, each bit corresponds to one candidate variant.
+// On return rcns->ncns is 1 or 2, rcns->cns_hap[] hold the selected haplotypes and
+// rcns->cvar/ncvar are compacted to the variants used by them. Always returns 0.
+static int correct_haplotype_errors(read_cns_t *rcns)
+{
+ int i,j, tot = 0;
+ ii_t freq[NHAP];
+ for (i=0; i<NHAP; i++)
+ {
+ freq[i].haplotype = i;
+ freq[i].count = rcns->hap_freq[i];
+ tot += rcns->hap_freq[i];
+ }
+ qsort(freq, NHAP, sizeof(ii_t), ii_cmp); // sort haplotypes in descending order
+ // walk from the least frequent haplotype towards the top two
+ for (i=NHAP-1; i>1; i--)
+ {
+ if ( !freq[i].count ) continue;
+ if ( freq[1].count > tot - freq[0].count - freq[1].count ) break; // the top2 haplotypes cannot change anymore
+
+ // Find a similar haplotype with the highest frequency. Assuming errors go in 0->1
+ // direction only and considering one error only.
+ int count = freq[i].count, max_hap = 0;
+ for (j=0; j<MAX_NCVAR; j++)
+ {
+ if ( !(freq[i].haplotype & (1U<<j)) ) continue; // j-th bit not set in this haplotype
+ int hap = freq[i].haplotype ^ (1U<<j); // toggle j-th bit
+ assert( hap>=0 && hap<NHAP );
+ if ( count < rcns->hap_freq[hap] ) count = rcns->hap_freq[hap], max_hap = hap;
+ }
+ // no more frequent neighbour found, keep this haplotype as is
+ if ( count == freq[i].count ) continue;
+
+ // Update frequency and sort the two modified elements
+ count = freq[i].count;
+ freq[i].count = 0;
+ rcns->hap_freq[freq[i].haplotype] = 0;
+ rcns->hap_freq[max_hap] += count;
+ // move the emptied element down past the remaining non-zero elements
+ for (j=i+1; j<NHAP; j++)
+ {
+ if ( !freq[j].count ) break;
+ ii_t tmp = freq[j-1]; freq[j-1] = freq[j]; freq[j] = tmp;
+ }
+ // add the count to the absorbing haplotype and let it move up to its sorted place
+ for (j=i-1; j>=0; j--)
+ {
+ if ( freq[j].haplotype==max_hap ) freq[j].count += count; // update the best matching haplotype
+ if ( freq[j].count < freq[j+1].count )
+ {
+ ii_t tmp = freq[j]; freq[j] = freq[j+1]; freq[j+1] = tmp;
+ }
+ }
+ }
+
+ // Use only one consensus if the next best haplotype is populated by less than 10% of reads
+ // NOTE(review): with no reads both counts are zero and the ratio is 0/0 - confirm callers
+ // never reach this point with an empty pileup
+ rcns->ncns = ((float)freq[1].count / (freq[0].count + freq[1].count) < 0.1) ? 1 : 2;
+
+ // Remove unused candidate variants from the top two haplotypes
+ int hap0 = freq[0].haplotype;
+ int hap1 = rcns->ncns==2 ? freq[1].haplotype : 0;
+ rcns->cns_hap[0] = 0;
+ rcns->cns_hap[1] = 0;
+ for (i=0,j=0; i<MAX_NCVAR; i++)
+ {
+ if ( !((hap0|hap1) & (1U<<i)) ) continue; // unused candidate variant, skip
+ if ( i!=j ) rcns->cvar[j] = rcns->cvar[i];
+ if ( hap0 & (1U<<i) ) rcns->cns_hap[0] |= 1U<<j;
+ if ( hap1 & (1U<<i) ) rcns->cns_hap[1] |= 1U<<j;
+ j++;
+ }
+ rcns->ncvar = j;
+
+#if DEBUG_RCNS
+ // This only matters for debugging print
+ memset(rcns->hap_freq,0,NHAP*sizeof(*rcns->hap_freq));
+ rcns->hap_freq[rcns->cns_hap[1]] = freq[1].count; // NB: the order matters when ncns==1
+ rcns->hap_freq[rcns->cns_hap[0]] = freq[0].count;
+#endif
+
+ return 0;
+}
+
+
+// Check how frequent are insertions adjacent to the j-th position. Note that reads with an
+// insertion usually increment also bfreq counts at this position, but not necessarily so,
+// therefore the counts are approximate.
+// When the most frequent insertion qualifies, its bases are appended to cns->seq and
+// cns->nseq is advanced. ivar is the index of the first candidate variant to inspect at
+// this position.
+static inline void apply_consensus_insertion(read_cns_t *rcns, cns_seq_t *cns, int j, int ivar)
+{
+ // Only apply consensus insertions that are not being tested by bam2bcf_iaux, i.e. not at the current pos
+ hts_pos_t ref_pos = rcns->beg + j;
+ if ( rcns->pos == ref_pos ) return;
+
+ // Only apply when there is no insertion at this position registered as a variant
+ while ( ivar < rcns->ncvar && rcns->cvar[ivar].pos == ref_pos )
+ {
+ if ( rcns->cvar[ivar].vtype == ins ) return;
+ ivar++;
+ }
+
+ base_freq_t *bfreq = rcns->base_freq;
+ ins_freq_t *ifreq = rcns->ins_freq;
+ // number of reads with a base (A,C,G,T,N) at this position; the deletion counter is excluded
+ int k, nreads = 0;
+ for (k=0; k<BF_DEL; k++) nreads += bfreq[j].base[k];
+ int max_freq = 0, kmax = 0;
+ for (k=0; k<NI && ifreq[j].len[k]; k++)
+ if ( max_freq < ifreq[j].freq[k] ) max_freq = ifreq[j].freq[k], kmax = k;
+
+ // Include consensus insertion only if it is supported by at least half of the reads
+ if ( nreads > max_freq*2 ) return;
+
+ // convert the stored nt16 codes to the 0-4 base codes used in the consensus
+ int len = ifreq[j].len[kmax];
+ char *seq = ifreq[j].nt16_seq[kmax];
+ for (k=0; k<len; k++)
+ cns->seq[cns->nseq++] = seq_nt16_int[(int)seq[k]];
+}
+
+// For each position of the realignment window apply either the candidate variants
+// from ith haplotype or decide on the base/ins/del by majority vote.
+//
+// rcns .. supplies the frequency arrays, the candidate variants (cvar) and the selected haplotypes (cns_hap)
+// ref .. reference sequence; ref[j] is read as the reference base of the window position rcns->beg+j
+// ith .. index of the consensus to build; bit i of rcns->cns_hap[ith] tells if cvar[i] is applied
+//
+// The sequence is appended to rcns->cns[ith].seq starting at the current cns->nseq (nseq is not
+// reset here) and cns->ipos is set to the sequence index that corresponds to the tested position.
+static void create_consensus(read_cns_t *rcns, const char *ref, int ith)
+{
+ int n = rcns->end - rcns->beg + 1; // window length
+ cns_seq_t *cns = &rcns->cns[ith];
+ base_freq_t *bfreq = rcns->base_freq;
+ ins_freq_t *ifreq = rcns->ins_freq;
+ del_freq_t *dfreq = rcns->del_freq;
+ hts_pos_t prev_pos = 0; // ref position of the last base appended by this function
+ int j,k, ivar = 0; // ivar: the first candidate variant that has not been passed yet
+ for (j=0; j<n; j++)
+ {
+ hts_pos_t ref_pos = rcns->beg + j;
+ if ( rcns->pos == ref_pos ) cns->ipos = cns->nseq; // remember where the tested position starts
+
+ // Skip candidate variants located before this position
+ while ( ivar < rcns->ncvar && rcns->cvar[ivar].pos < ref_pos ) ivar++;
+
+ if ( ivar >= rcns->ncvar || rcns->cvar[ivar].pos != ref_pos )
+ {
+ // This position is not recognised as a het variant so take the most frequent base, including
+ // a deletion if that is most frequent. However, for deleted bases make sure they are not part
+ // of the deletion that is being tested at this positions
+ // (the reference base is the fallback when nothing was observed)
+ int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]];
+ // BF_DEL takes part in the vote only outside of the interval [pos,pos+max_del]
+ int nk = ( ref_pos < rcns->pos || ref_pos > rcns->pos + rcns->max_del ) ? BF_DEL+1 : BF_DEL;
+ for (k=0; k<nk; k++)
+ if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k;
+
+ if ( kmax!=BF_DEL ) // the most frequent base can be a deletion
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = kmax;
+ }
+ // Only apply consensus insertions that are not being tested by bam2bcf_iaux, i.e. not at the current pos
+ apply_consensus_insertion(rcns, cns, j, ivar);
+ continue;
+ }
+ int which = rcns->cvar[ivar].which;
+ if ( !(rcns->cns_hap[ith] & (1U<<ivar)) )
+ {
+ // This position has a heterozygous variant but not in this haplotype. Take the
+ // most frequent base different from the ivar-th variant
+ int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]];
+ for (k=0; k<6; k++)
+ {
+ if ( rcns->cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue;
+ if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k;
+ }
+ // Append the base unless it is a deletion or a base for this position was added already
+ if ( kmax!=BF_DEL && (!cns->nseq || prev_pos != ref_pos) )
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = kmax;
+ }
+ apply_consensus_insertion(rcns, cns, j, ivar);
+ continue;
+ }
+ if ( rcns->cvar[ivar].vtype == snv )
+ {
+ // The SNV is part of this haplotype, use its base directly
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = which;
+ apply_consensus_insertion(rcns, cns, j, ivar);
+ continue;
+ }
+
+ // There can be multiple variants at this position, for example snv+ins. SNVs come first
+ // thanks to cvar_pos_cmp(), make sure the base has not been added already.
+ if ( !cns->nseq || prev_pos != ref_pos )
+ {
+ int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]];
+ for (k=0; k<6; k++)
+ {
+ // NB: SNVs were handled above, so the vtype==snv test cannot be true here
+ if ( rcns->cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue;
+ if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k;
+ }
+ if ( kmax!=BF_DEL )
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = kmax;
+ }
+ }
+ if ( rcns->cvar[ivar].vtype == ins )
+ {
+ // Append the inserted sequence after the anchor base
+ int len = ifreq[j].len[which];
+ char *seq = ifreq[j].nt16_seq[which];
+ for (k=0; k<len; k++)
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = seq_nt16_int[(int)seq[k]];
+ }
+ }
+ // Jump over the deleted reference bases, the loop increment then moves to the first base after
+ else if ( rcns->cvar[ivar].vtype == del ) j += dfreq[j].len[which];
+ }
+}
+
+// Build the consensus sequence(s) from the reads registered via rcns_set_reads().
+// The steps:
+//  1. Identify heterozygous candidate variants (select_candidate_variants)
+//  2. Count how many reads support each combination of the candidates, i.e. each
+//     haplotype (create_haplotype_frequency_spectrum)
+//  3. Merge likely erroneous haplotypes and keep the top one or two (correct_haplotype_errors)
+//  4. Create one consensus sequence per selected haplotype (create_consensus)
+// Without candidate variants a single majority-vote consensus is created.
+//
+// Returns the rcns->cns array; rcns->ncns sequences are filled.
+// NOTE(review): the return value of select_candidate_variants() is not checked here.
+cns_seq_t *rcns_get_consensus(read_cns_t *rcns, const char *ref)
+{
+    int ith;
+
+    debug_print_base_freqs(rcns, ref);
+    select_candidate_variants(rcns, ref);
+    debug_print_candidate_variants(rcns);
+
+    if ( !rcns->ncvar )
+    {
+        // nothing to phase, one consensus with no candidate variants applied
+        rcns->ncns = 1;
+        rcns->cns_hap[0] = 0;
+    }
+    else
+    {
+        create_haplotype_frequency_spectrum(rcns);
+        debug_print_haplotype_frequency_spectrum(rcns);
+
+        correct_haplotype_errors(rcns);
+        debug_print_candidate_variants(rcns);
+        debug_print_haplotype_frequency_spectrum(rcns);
+    }
+
+    ith = 0;
+    while ( ith < rcns->ncns )
+    {
+        create_consensus(rcns,ref,ith);
+        ith++;
+    }
+    debug_print_consensus(rcns,ref);
+
+    return rcns->cns;
+}
--- /dev/null
+#include "bcftools.pysam.h"
+
+/* read_consensus.c -- create and maintain consensus of reads
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: pd3@sanger
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE. */
+
+#include <assert.h>
+#include <math.h>
+#include "bcftools.h"
+#include "read_consensus.h"
+#include "cigar_state.h"
+#include "kheap.h"
+
+
+// Frequency arrays for each variant type
+#define NI 10 // number of alternative insertion sequences at one position in a single sample
+
+// Insertions observed after one window position. Slots are filled from index 0,
+// the first slot with len==0 (nt16_seq==NULL) terminates the list.
+typedef struct
+{
+    char *nt16_seq[NI];     // inserted sequences, one bam_seqi() code per char, malloc'ed in add_ins()
+    int len[NI];            // length of each inserted sequence
+    int freq[NI];           // number of times each insertion was added
+}
+ins_freq_t;
+
+// Deletions observed after one window position. Slots are filled from index 0,
+// the first slot with len==0 terminates the list.
+typedef struct
+{
+    int len[NI];            // deletion lengths
+    int freq[NI];           // number of times each deletion length was added
+}
+del_freq_t;
+
+#define BF_DEL 5 // index of the deletion counter in base_freq_t.base
+
+// Per-position counts of bases and of deletions spanning the position
+typedef struct
+{
+    int base[6];            // frequencies of A,C,G,T,N,deletion
+}
+base_freq_t;
+
+
+// Candidate variants for each interesting position to build consensus haplotypes.
+// The value `done` is not a variant type, register_variant() uses it as the finalize signal.
+enum variant_type { snv, ins, del, done };
+
+typedef struct
+{
+    enum variant_type vtype;
+    hts_pos_t pos;          // variant position (reference sequence coordinates), indels follow VCF convention
+    int idx;                // temporary 0-based index to rcns.cvar
+    int which;              // base/ins/del in rcns.[base|ins|del]_freq array
+    int depth;              // coverage at the position
+    float af;               // variant allele frequency (just for debugging printout)
+    float af_dev;           // absolute af deviation from 0.5
+}
+candidate_var_t;
+// Heap ordering function: returns 1 if variant `a` is less preferred than `b`, 0 otherwise.
+// The variant with allele frequency further from 0.5 is less preferred; on equal deviation
+// the one with smaller depth is.
+static inline int cvar_not_preferred(candidate_var_t *a, candidate_var_t *b)
+{
+    if ( a->af_dev != b->af_dev ) return a->af_dev > b->af_dev;
+    return a->depth < b->depth;
+}
+// Instantiate the heap of candidate variants ordered by cvar_not_preferred().
+// register_variant() relies on dat[0] being the least preferred element of the heap.
+KHEAP_INIT(cvh, candidate_var_t, cvar_not_preferred);
+typedef khp_cvh_t cvar_heap_t;
+
+#define MAX_NCVAR 8         // This results in alloc() of 2^MAX_NCVAR possible haplotypes
+#define NHAP (1<<MAX_NCVAR) // The number of possible haplotypes
+
+// The state of the read consensus: the window, per-position frequencies,
+// candidate variants, haplotype counts and the resulting consensus sequences
+struct _read_cns_t
+{
+    hts_pos_t pos;                      // current position (0-based, ref seq coordinates)
+    hts_pos_t beg;                      // window boundaries (0-based, inclusive, ref seq coordinates)
+    hts_pos_t end;
+    int band;                           // maximum absolute deviation from the diagonal, used for BAQ alignment
+    int max_del;                        // maximum deletion length starting at the tested position
+    base_freq_t *base_freq;             // frequency of each variant type: base, ins, del
+    ins_freq_t *ins_freq;
+    del_freq_t *del_freq;
+    char *stmp;                         // temporary array
+    int mstmp;                          // allocated size of stmp
+    int mfreq;                          // allocated size of the *_freq arrays
+    cvar_heap_t *cv_heap;               // heap to maintain the top MAX_NCVAR variants
+    int ncvar;                          // cvar and cv_heap size
+    candidate_var_t cvar[MAX_NCVAR];    // candidate variants, sorted by position and type
+    int hap_freq[NHAP];                 // haplotype frequencies
+    bam_pileup1_t *plp;                 // reads to construct consensus from
+    int nplp;                           // number of reads in the pileup
+    int cns_hap[2];                     // the top two consensus haplotypes
+    int ncns;                           // the number of haplotypes to use
+    int mcns;                           // the allocated size of the cns[].seq buffers
+    cns_seq_t cns[3];                   // the consensus sequences to fill
+};
+
+// Release everything owned by rcns including the structure itself. NULL is accepted.
+void rcns_destroy(read_cns_t *rcns)
+{
+    if ( rcns==NULL ) return;
+
+    // insertion sequences stored in any of the allocated window positions
+    int ipos, k;
+    for (ipos=0; ipos<rcns->mfreq; ipos++)
+    {
+        char **seqs = rcns->ins_freq[ipos].nt16_seq;
+        for (k=0; k<NI && seqs[k]; k++) free(seqs[k]);
+    }
+
+    // only the first two consensus buffers are ever allocated
+    free(rcns->cns[0].seq);
+    free(rcns->cns[1].seq);
+
+    free(rcns->ins_freq);
+    free(rcns->del_freq);
+    free(rcns->base_freq);
+    free(rcns->stmp);
+    khp_destroy(cvh,rcns->cv_heap);
+    free(rcns);
+}
+// Make the per-position frequency arrays big enough for the window [beg,end] and clear
+// the part used by the window, freeing the insertion sequences stored there. The arrays
+// only grow, rcns->mfreq keeps their allocated size.
+// Returns 0 on success and -1 if a reallocation failed.
+static int init_arrays(read_cns_t *rcns)
+{
+    const int nwin = rcns->end - rcns->beg + 1;
+    int ipos, k;
+
+    if ( rcns->mfreq < nwin )
+    {
+        // The newly added tail of each array is zeroed right after its realloc, so
+        // the arrays remain usable even if one of the later reallocs fails
+        ins_freq_t *ins_arr = (ins_freq_t*) realloc(rcns->ins_freq,sizeof(*rcns->ins_freq)*nwin);
+        if ( !ins_arr ) return -1;
+        rcns->ins_freq = ins_arr;
+        memset(ins_arr+rcns->mfreq,0,sizeof(*rcns->ins_freq)*(nwin-rcns->mfreq));
+
+        del_freq_t *del_arr = (del_freq_t*) realloc(rcns->del_freq,sizeof(*rcns->del_freq)*nwin);
+        if ( !del_arr ) return -1;
+        rcns->del_freq = del_arr;
+        memset(del_arr+rcns->mfreq,0,sizeof(*rcns->del_freq)*(nwin-rcns->mfreq));
+
+        base_freq_t *base_arr = (base_freq_t*) realloc(rcns->base_freq,sizeof(*rcns->base_freq)*nwin);
+        if ( !base_arr ) return -1;
+        rcns->base_freq = base_arr;
+        memset(base_arr+rcns->mfreq,0,sizeof(*rcns->base_freq)*(nwin-rcns->mfreq));
+
+        rcns->mfreq = nwin;
+    }
+
+    memset(rcns->base_freq,0,sizeof(*rcns->base_freq)*nwin);
+    memset(rcns->del_freq,0,sizeof(*rcns->del_freq)*nwin);
+
+    // Insertion sequences from the previous use of the window must be freed before zeroing
+    for (ipos=0; ipos<nwin; ipos++)
+    {
+        char **seqs = rcns->ins_freq[ipos].nt16_seq;
+        for (k=0; k<NI && seqs[k]; k++) free(seqs[k]);
+    }
+    memset(rcns->ins_freq,0,sizeof(*rcns->ins_freq)*nwin);
+
+    return 0;
+}
+// Prepare rcns for a new position and window: the consensus sequences and the band
+// are reset, the candidate variant heap is emptied and the frequency arrays cleared.
+// Returns the status of init_arrays(): 0 on success, -1 on allocation failure.
+int rcns_reset(read_cns_t *rcns, hts_pos_t pos, hts_pos_t beg, hts_pos_t end)
+{
+    cns_seq_t *cns;
+
+    rcns->pos = pos;
+    rcns->beg = beg;
+    rcns->end = end;
+    rcns->band = 0;
+
+    for (cns=rcns->cns; cns<rcns->cns+2; cns++)
+    {
+        cns->ipos = 0;
+        cns->nseq = 0;
+    }
+
+    // this should not be necessary if the caller did run all steps
+    while (rcns->cv_heap->ndat) khp_delete(cvh, rcns->cv_heap);
+
+    return init_arrays(rcns);
+}
+
+// Count one observation of the base `nt16` (4-bit code) at the reference position ref_pos
+static inline void add_base(read_cns_t *rcns, int ref_pos, int nt16)
+{
+    const int iwin = ref_pos - rcns->beg;   // index relative to the window start
+    base_freq_t *counts = &rcns->base_freq[iwin];
+    counts->base[seq_nt16_int[nt16]]++;
+}
+// Count one observation of an insertion of `len` bases at the reference position ref_pos.
+// The inserted bases are read from the packed read sequence raw_seq starting at seq_pos.
+// The insertion is silently dropped when memory cannot be allocated or when all NI slots
+// at this position hold different insertions already.
+static void add_ins(read_cns_t *rcns, int ref_pos, int seq_pos, uint8_t *raw_seq, int len)
+{
+    const int iwin = ref_pos - rcns->beg;
+    ins_freq_t *ifrq = &rcns->ins_freq[iwin];
+    int k;
+
+    // Unpack the inserted sequence into the reusable temporary buffer, one code per char
+    char *buf = rcns->stmp;
+    if ( rcns->mstmp < len )
+    {
+        buf = realloc(rcns->stmp,len*sizeof(*buf));
+        if ( !buf ) return;
+        rcns->mstmp = len;
+        rcns->stmp = buf;
+    }
+    for (k=0; k<len; k++) buf[k] = bam_seqi(raw_seq,k+seq_pos);
+
+    // Look for the same insertion among those seen already, or for the first free slot
+    int slot = 0;
+    while ( slot<NI && ifrq->nt16_seq[slot] )
+    {
+        if ( ifrq->len[slot]==len && !memcmp(ifrq->nt16_seq[slot],buf,len) ) break;
+        slot++;
+    }
+    if ( slot>=NI ) return;     // too many choices, typically homopolymers in long reads; discard
+
+    if ( !ifrq->nt16_seq[slot] )    // new insertion
+    {
+        char *copy = malloc(len);
+        if ( !copy ) return;
+        memcpy(copy, buf, len);
+        ifrq->nt16_seq[slot] = copy;
+        ifrq->len[slot] = len;
+    }
+    ifrq->freq[slot]++;
+}
+// Count one observation of a deletion of `len` bases anchored at the reference position
+// ref_pos (the base before the deleted ones). The deleted positions, clipped to the window,
+// get their BF_DEL counter incremented. The deletion length itself is dropped when all NI
+// slots at this position hold different lengths already.
+static void add_del(read_cns_t *rcns, int ref_pos, int len)
+{
+    const int ianchor = ref_pos - rcns->beg;
+    const int nwin = rcns->end - rcns->beg + 1;
+
+    // one past the last deleted position, not going beyond the window
+    int stop = ianchor + len + 1;
+    if ( stop > nwin ) stop = nwin;
+
+    int ipos;
+    for (ipos=ianchor+1; ipos<stop; ipos++)
+        rcns->base_freq[ipos].base[BF_DEL]++;
+
+    // Look for the same deletion length among those seen already, or for the first free slot
+    del_freq_t *dfrq = &rcns->del_freq[ianchor];
+    int slot = 0;
+    while ( slot<NI && dfrq->len[slot] && dfrq->len[slot]!=len ) slot++;
+
+    if ( slot>=NI ) return;     // too many choices, typically homopolymers in long reads; discard
+
+    if ( !dfrq->len[slot] ) dfrq->len[slot] = len;      // new deletion
+    dfrq->freq[slot]++;
+}
+
+// Allocate and initialize the read consensus structure for the tested position `pos`
+// and the window [beg,end] (0-based, inclusive).
+// Returns NULL if any of the allocations failed; otherwise the caller owns the result
+// and releases it with rcns_destroy().
+read_cns_t *rcns_init(hts_pos_t pos, hts_pos_t beg, hts_pos_t end)
+{
+    read_cns_t *rcns = (read_cns_t*) calloc(1,sizeof(read_cns_t));
+    if ( !rcns ) return NULL;
+
+    rcns->pos = pos;
+    rcns->beg = beg;
+    rcns->end = end;
+
+    rcns->cv_heap = khp_init(cvh);
+    if ( !rcns->cv_heap )
+    {
+        // nothing else has been allocated yet, no need to go through rcns_destroy()
+        free(rcns);
+        return NULL;
+    }
+
+    if ( init_arrays(rcns)!=0 )
+    {
+        rcns_destroy(rcns);
+        return NULL;
+    }
+    return rcns;
+}
+
+// Register the reads of one pileup and fill the base/ins/del frequency arrays from them.
+//
+// rcns .. the consensus structure, its window [beg,end] must be set (rcns_init or rcns_reset)
+// plp .. array of pileup records; only the pointer is stored, the reads are accessed again later
+// nplp .. number of records in plp
+//
+// Each read's CIGAR is walked and the bases, insertions and deletions falling in the window are
+// counted. Also updates rcns->band. Always returns 0; an unsupported CIGAR operator calls error().
+int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp)
+{
+ // save the reads for phasing, this can be called multiple times
+ rcns->plp = plp;
+ rcns->nplp = nplp;
+
+ // fill consensus arrays
+ int i,j,k, local_band_max = 0; // maximum absolute deviation from diagonal
+ for (i=0; i<nplp; i++) // for each read...
+ {
+ const bam_pileup1_t *p = plp + i;
+ bam1_t *b = p->b;
+ // NOTE(review): x is an int; presumably b->core.pos is a wider hts_pos_t - confirm large positions cannot occur
+ int x = b->core.pos; // ref coordinate
+ int y = 0; // seq coordinate
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+ int local_band = 0; // current deviation from diagonal
+ for (k = 0; k < b->core.n_cigar; ++k)
+ {
+ int op = cigar[k] & BAM_CIGAR_MASK;
+ int len = cigar[k] >> BAM_CIGAR_SHIFT;
+ if ( op==BAM_CSOFT_CLIP ) y += len; // soft-clipped bases are in the read sequence but not aligned
+ else if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF )
+ {
+ if ( x<rcns->end && x+len>rcns->beg )
+ {
+ // The block overlaps the window, count only the bases inside of it
+ int j_beg = rcns->beg > x ? rcns->beg - x : 0; // how many bases to skip in the ref and qry
+ int j_end = rcns->end < x + len - 1 ? rcns->end - x : len - 1;
+ x += j_beg;
+ y += j_beg;
+ // NB: when the block extends past the window end, x and y stop right after the window end
+ for (j=j_beg; j<=j_end; j++, x++, y++) add_base(rcns,x,bam_seqi(seq,y));
+ }
+ else
+ {
+ x += len;
+ y += len;
+ }
+ }
+ else if ( op==BAM_CINS )
+ {
+ // Count only insertions strictly inside the window
+ if ( x>rcns->beg && x<rcns->end )
+ {
+ local_band += p->indel;
+ add_ins(rcns,x-1,y,seq,len); // x-1: one base before as in VCF
+ }
+ y += len;
+ }
+ else if ( op==BAM_CDEL )
+ {
+ // Count only deletions which start after the window start and do not extend past its end
+ if ( x>rcns->beg && x+len-1<=rcns->end )
+ {
+ local_band += -p->indel;
+ add_del(rcns,x-1,len); // x-1: one base before as in VCF
+ }
+ x += len;
+ }
+ else if ( op==BAM_CHARD_CLIP ) continue; // hard-clipped bases are not stored in the read, nothing to advance
+ else error("rcns_set_reads todo: unknown cigar operator %d\n",op);
+ if ( local_band_max < local_band ) local_band_max = local_band;
+ }
+
+ // Track the biggest deviation +/- from diagonal, used in BAQ alignment step.
+ if ( rcns->band < local_band_max ) rcns->band = local_band_max;
+ }
+
+ return 0;
+}
+
+#if DEBUG_RCNS
+// Debugging printout: one line per window position with the reference base, the base
+// counts (the reference one marked with '*'), the deletions and the insertions
+static void debug_print_base_freqs(read_cns_t *rcns, const char *ref)
+{
+    const int nwin = rcns->end - rcns->beg + 1;
+    int ipos, slot, k;
+
+    fprintf(bcftools_stderr,"beg,end,pos=%d %d %d\n",(int)rcns->beg,(int)rcns->end,(int)rcns->pos);
+
+    for (ipos=0; ipos<nwin && ref[ipos]; ipos++)
+    {
+        const base_freq_t *bases = &rcns->base_freq[ipos];
+        const del_freq_t *dels = &rcns->del_freq[ipos];
+        const ins_freq_t *inss = &rcns->ins_freq[ipos];
+
+        fprintf(bcftools_stderr,"%"PRIhts_pos" %c\t",rcns->beg+ipos+1,ref[ipos]);
+
+        for (k=0; k<6; k++)
+            fprintf(bcftools_stderr,"\t%d%s",bases->base[k],ref[ipos]=="ACGTNi"[k]?"*":"");
+
+        fprintf(bcftools_stderr,"\t");
+        for (slot=0; slot<NI && dels->len[slot]; slot++)
+            fprintf(bcftools_stderr," -%d:%d",dels->len[slot],dels->freq[slot]);
+
+        fprintf(bcftools_stderr,"\t");
+        for (slot=0; slot<NI && inss->len[slot]; slot++)
+        {
+            fprintf(bcftools_stderr," +");
+            for (k=0; k<inss->len[slot]; k++)
+                fprintf(bcftools_stderr,"%c",seq_nt16_str[(int)inss->nt16_seq[slot][k]]);
+            fprintf(bcftools_stderr,":%d",inss->freq[slot]);
+        }
+        fprintf(bcftools_stderr,"\n");
+    }
+}
+// Debugging printout: the name of the variant type, "???" for anything but snv/ins/del
+static const char *vtype2string(enum variant_type vtype)
+{
+    switch (vtype)
+    {
+        case snv: return "snv";
+        case ins: return "ins";
+        case del: return "del";
+        default:  return "???";
+    }
+}
+// Debugging printout: list the candidate variants currently stored in rcns->cvar
+static void debug_print_candidate_variants(read_cns_t *rcns)
+{
+    int ivar = 0;
+    fprintf(bcftools_stderr,"Candidate variants:\n");
+    while ( ivar < rcns->ncvar )
+    {
+        const candidate_var_t *cv = &rcns->cvar[ivar];
+        fprintf(bcftools_stderr,"\tvar%d pos=%"PRIhts_pos" idx=%d vtype=%s which=%d depth=%d af=%f af_dev=%f\n",
+            ivar,cv->pos+1,cv->idx,vtype2string(cv->vtype),cv->which,cv->depth,cv->af,cv->af_dev);
+        ivar++;
+    }
+}
+// Debugging printout: every haplotype with a non-zero count, shown as its index,
+// the bit pattern (one digit per candidate variant) and the count
+static void debug_print_haplotype_frequency_spectrum(read_cns_t *rcns)
+{
+    int hap, ivar;
+    fprintf(bcftools_stderr,"Haplotype frequencies (bits from left correspond to var0,1,..):\n");
+    for (hap=0; hap<NHAP; hap++)
+    {
+        const int nreads = rcns->hap_freq[hap];
+        if ( nreads==0 ) continue;
+
+        fprintf(bcftools_stderr,"\t%d: ",hap);
+        for (ivar=0; ivar<rcns->ncvar; ivar++)
+            fprintf(bcftools_stderr,"%d", hap&(1<<ivar) ? 1 : 0);
+        fprintf(bcftools_stderr,"\t%d\n", nreads);
+    }
+}
+// Debugging printout: the reference followed by the non-empty consensus sequences,
+// with '#' printed after the base at the index cns.ipos
+static void debug_print_consensus(read_cns_t *rcns, const char *ref)
+{
+    const int nwin = rcns->end - rcns->beg + 1;
+    int icns, k;
+
+    fprintf(bcftools_stderr,"ref: ");
+    for (k=0; k<nwin && ref[k]; k++) fprintf(bcftools_stderr,"%c",ref[k]);
+    fprintf(bcftools_stderr,"\n");
+
+    for (icns=0; icns<2; icns++)
+    {
+        const cns_seq_t *cns = &rcns->cns[icns];
+        if ( !cns->nseq ) break;
+
+        fprintf(bcftools_stderr,"Consensus%d: ",icns);
+        k = 0;
+        while ( k<=cns->ipos )
+        {
+            fprintf(bcftools_stderr,"%c","ACGTN"[(int)cns->seq[k]]);
+            k++;
+        }
+        fprintf(bcftools_stderr,"#");
+        while ( k<cns->nseq )
+        {
+            fprintf(bcftools_stderr,"%c","ACGTN"[(int)cns->seq[k]]);
+            k++;
+        }
+        fprintf(bcftools_stderr,"\n");
+    }
+}
+#else
+// With DEBUG_RCNS unset the debugging printouts expand to nothing
+#define debug_print_base_freqs(rcns,ref)
+#define debug_print_candidate_variants(rcns)
+#define debug_print_haplotype_frequency_spectrum(rcns)
+#define debug_print_consensus(rcns,ref)
+#endif
+
+// qsort() comparison function for candidate variants: ascending by position,
+// then by variant type (snv, ins, del) and finally by the `which` index
+static int cvar_pos_cmp(const void *aptr, const void *bptr)
+{
+    const candidate_var_t *lhs = (const candidate_var_t*)aptr;
+    const candidate_var_t *rhs = (const candidate_var_t*)bptr;
+
+    if ( lhs->pos != rhs->pos ) return lhs->pos < rhs->pos ? -1 : 1;
+    if ( lhs->vtype != rhs->vtype ) return lhs->vtype < rhs->vtype ? -1 : 1;
+    if ( lhs->which != rhs->which ) return lhs->which < rhs->which ? -1 : 1;
+    return 0;
+}
+// Add a candidate variant to the heap of the top MAX_NCVAR variants, or finalize.
+//
+// vtype .. snv, ins or del to add a variant; `done` to finalize, the remaining arguments are then ignored
+// cns_pos .. variant position relative to the window start
+// which .. index of the base/ins/del in the frequency arrays
+// depth .. coverage at the position
+// freq .. variant allele frequency
+//
+// With the heap full, the new variant either replaces the least preferred one or is dropped.
+// Finalizing moves the heap content to rcns->cvar sorted by position, type and `which`, and
+// sets rcns->ncvar.
+static void register_variant(read_cns_t *rcns, enum variant_type vtype, int cns_pos, int which, int depth, float freq)
+{
+    cvar_heap_t *heap = rcns->cv_heap;
+
+    if ( vtype!=done )
+    {
+        candidate_var_t cand;
+        cand.vtype = vtype;
+        cand.pos = cns_pos + rcns->beg;
+        cand.which = which;
+        cand.depth = depth;
+        cand.af = freq;
+        cand.af_dev = fabs(0.5-freq);
+
+        // keep the number of variants small, maximum MAX_NCVAR
+        if ( rcns->ncvar!=MAX_NCVAR )
+            cand.idx = rcns->ncvar++;
+        else
+        {
+            // no need to add, the new variant is worse than the heap's worst one
+            if ( cvar_not_preferred(&cand,&heap->dat[0]) ) return;
+
+            // reuse the slot of the variant being evicted
+            cand.idx = heap->dat[0].idx;
+            khp_delete(cvh,heap);
+        }
+        rcns->cvar[cand.idx] = cand;
+        khp_insert(cvh,heap,&cand);
+        return;
+    }
+
+    // Finalize: empty the heap into the cvar array
+    rcns->ncvar = 0;
+    while (heap->ndat)
+    {
+        rcns->cvar[rcns->ncvar++] = heap->dat[0];
+        khp_delete(cvh,heap);
+    }
+
+    // sort the variants by pos,type,which to make determination of haplotypes from reads faster
+    if ( rcns->ncvar )
+        qsort(rcns->cvar, rcns->ncvar, sizeof(*rcns->cvar), cvar_pos_cmp);
+}
+
+// Identify candidate variant positions. (Note that homozygous variants are not considered
+// as those will be added trivially by taking the consensus base.) The detection limit is
+// for now hard-wired. This has only indirect effect on sensitivity, will just not contribute
+// to the consensus template when realigning.
+//
+// Variants with allele frequency in (0.1,0.9) at any window position but the tested one are
+// registered; at most MAX_NCVAR of them are kept in rcns->cvar. The function also makes sure
+// the consensus buffers can hold the window plus all observed insertions, and determines the
+// longest deletion starting at the tested position (rcns->max_del).
+//
+// Returns 0 on success and -1 if the consensus buffers could not be allocated.
+static int select_candidate_variants(read_cns_t *rcns, const char *ref)
+{
+    const float af_th = 0.1;
+    int i,j, n = rcns->end - rcns->beg + 1;
+    int max_ins_len = 0;    // maximum total length of all insertions applied to allocate big enough buffers
+    base_freq_t *bfreq = rcns->base_freq;
+    ins_freq_t *ifreq = rcns->ins_freq;
+    del_freq_t *dfreq = rcns->del_freq;
+
+    // The heap is empty at this point (emptied either by the previous finalize call or by
+    // rcns_reset), so the counter register_variant() uses to detect a full heap must start
+    // from zero as well. A stale count left from the previous run would reduce the number
+    // of variants that can be kept.
+    rcns->ncvar = 0;
+
+    for (i=0; i<n && ref[i]; i++)
+    {
+        for (j=0; j<NI && ifreq[i].len[j]; j++) max_ins_len += ifreq[i].len[j];
+
+        if ( i==rcns->pos - rcns->beg ) continue;   // creating consensus from everything but the variants at the current position
+
+        // Depth: reads with A,C,G,T plus the reads with a deletion or an insertion after this position
+        int dp = 0;
+        for (j=0; j<4; j++) dp += bfreq[i].base[j];
+        for (j=0; j<NI && dfreq[i].len[j]; j++) dp += dfreq[i].freq[j];
+        for (j=0; j<NI && ifreq[i].len[j]; j++) dp += ifreq[i].freq[j];
+
+        float af = 0;   // allele frequency
+        for (j=0; j<4; j++)
+        {
+            if ( !bfreq[i].base[j] || ref[i]=="ACGTN"[j] ) continue;   // ref base or no coverage
+            af = (float)bfreq[i].base[j]/dp;
+            if ( af>af_th && af<(1-af_th) ) register_variant(rcns,snv,i,j,dp,af);
+        }
+        for (j=0; j<NI && dfreq[i].len[j]; j++)
+        {
+            af = (float)dfreq[i].freq[j]/dp;
+            if ( af>af_th && af<(1-af_th) ) register_variant(rcns,del,i,j,dp,af);
+        }
+        for (j=0; j<NI && ifreq[i].len[j]; j++)
+        {
+            af = (float)ifreq[i].freq[j]/dp;
+            if ( af>af_th && af<(1-af_th) ) register_variant(rcns,ins,i,j,dp,af);
+        }
+    }
+    register_variant(rcns,done,0,0,0,0);    // finalize
+
+    // Reallocate buffers
+    if ( rcns->mcns < n + max_ins_len )
+    {
+        n += max_ins_len;
+        for (i=0; i<2; i++)
+        {
+            char *seq = (char*) realloc(rcns->cns[i].seq,sizeof(char)*n);
+            if ( !seq ) return -1;
+            rcns->cns[i].seq = seq;
+        }
+        rcns->mcns = n;
+    }
+
+    // Find the longest deletion at the query position. The list of deletions is terminated
+    // by a zero length; all filled slots must be visited regardless of the slot index.
+    i = rcns->pos - rcns->beg;
+    rcns->max_del = 0;
+    for (j=0; j<NI && dfreq[i].len[j]; j++)
+    {
+        if ( rcns->max_del < dfreq[i].len[j] ) rcns->max_del = dfreq[i].len[j];
+    }
+
+    return 0;
+}
+// Count how many reads support each combination of the candidate variants. For every read
+// a haplotype is formed with bit j set when the read carries the j-th variant of rcns->cvar,
+// and rcns->hap_freq[haplotype] is incremented. Always returns 0.
+//
+// NOTE(review): cstate_seek_op_fwd() is defined elsewhere; from the use here -2 presumably
+// means the read ends before the position (no later variant can match) and -1 that the read
+// does not have the requested operation at the position - confirm in cigar_state.h
+static int create_haplotype_frequency_spectrum(read_cns_t *rcns)
+{
+ memset(rcns->hap_freq,0,sizeof(rcns->hap_freq));
+
+ int i;
+ for (i=0; i<rcns->nplp; i++) // for each read...
+ {
+ const bam_pileup1_t *p = rcns->plp + i;
+ cigar_state_t cigar;
+ cstate_init(&cigar,p->b);
+
+ int j,k,hap = 0;
+ // The variants are sorted by position, the CIGAR is therefore walked forward only
+ for (j=0; j<rcns->ncvar; j++)
+ {
+ candidate_var_t *cvar = &rcns->cvar[j];
+ if ( cvar->vtype==snv )
+ {
+ int iseq = cstate_seek_op_fwd(&cigar, cvar->pos, BAM_CMATCH, NULL);
+ if ( iseq==-2 ) break;
+ if ( iseq==-1 ) continue;
+ int nt16 = bam_seqi(cigar.seq, iseq);
+ if ( seq_nt16_int[nt16]==cvar->which ) hap |= 1<<j; // the read has the variant base
+ }
+ else if ( cvar->vtype==ins )
+ {
+ int len;
+ ins_freq_t *ifrq = &rcns->ins_freq[cvar->pos - rcns->beg];
+ // pos+1: the variant position is the base before the insertion
+ int iseq = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CINS, &len);
+ if ( iseq==-2 ) break;
+ if ( iseq==-1 ) continue;
+ if ( len!=ifrq->len[cvar->which] ) continue;
+ // Both the length and the sequence of the insertion must match
+ for (k=0; k<ifrq->len[cvar->which]; k++)
+ if ( bam_seqi(cigar.seq,iseq+k)!=ifrq->nt16_seq[cvar->which][k] ) break;
+ if ( k==ifrq->len[cvar->which] ) hap |= 1<<j;
+ }
+ else if ( cvar->vtype==del )
+ {
+ int len;
+ del_freq_t *dfrq = &rcns->del_freq[cvar->pos - rcns->beg];
+ // pos+1: the variant position is the base before the deletion
+ int ret = cstate_seek_op_fwd(&cigar, cvar->pos+1, BAM_CDEL, &len);
+ if ( ret==-2 ) break;
+ if ( ret==-1 ) continue;
+ if ( len!=dfrq->len[cvar->which] ) continue;
+ hap |= 1<<j;
+ }
+ }
+ rcns->hap_freq[hap]++;
+ }
+ return 0;
+}
+
+// A haplotype (bit mask of candidate variants) paired with the number of supporting reads
+typedef struct
+{
+    int haplotype;
+    int count;
+}
+ii_t;
+
+// qsort() comparison function ordering ii_t elements by count in descending order
+static int ii_cmp(const void *a, const void *b)
+{
+    const ii_t *lhs = (const ii_t*)a;
+    const ii_t *rhs = (const ii_t*)b;
+    if ( lhs->count == rhs->count ) return 0;
+    return lhs->count > rhs->count ? -1 : 1;
+}
+
+// Select two most common haplotypes trying to account for 1bp errors. Haplotypes
+// are represented as 8-bit numbers, each bit corresponds to one candidate variant.
+//
+// Starting from the least frequent haplotype, reads of a haplotype are reassigned to the
+// most frequent haplotype which differs by having one of the set bits cleared, provided it
+// is more frequent. On return rcns->ncns is 1 or 2, rcns->cns_hap[] holds the selected
+// haplotypes and rcns->cvar/ncvar are reduced to the variants the haplotypes use (bits in
+// cns_hap are renumbered accordingly). Always returns 0.
+static int correct_haplotype_errors(read_cns_t *rcns)
+{
+ int i,j, tot = 0;
+ ii_t freq[NHAP];
+ for (i=0; i<NHAP; i++)
+ {
+ freq[i].haplotype = i;
+ freq[i].count = rcns->hap_freq[i];
+ tot += rcns->hap_freq[i];
+ }
+ qsort(freq, NHAP, sizeof(ii_t), ii_cmp); // sort haplotypes in descending order
+ for (i=NHAP-1; i>1; i--)
+ {
+ if ( !freq[i].count ) continue;
+ if ( freq[1].count > tot - freq[0].count - freq[1].count ) break; // the top2 haplotypes cannot change anymore
+
+ // Find a similar haplotype with the highest frequency. Assuming errors go in 0->1
+ // direction only and considering one error only.
+ int count = freq[i].count, max_hap = 0;
+ for (j=0; j<MAX_NCVAR; j++)
+ {
+ if ( !(freq[i].haplotype & (1U<<j)) ) continue; // j-th bit not set in this haplotype
+ int hap = freq[i].haplotype ^ (1U<<j); // toggle j-th bit
+ assert( hap>=0 && hap<NHAP );
+ if ( count < rcns->hap_freq[hap] ) count = rcns->hap_freq[hap], max_hap = hap;
+ }
+ if ( count == freq[i].count ) continue; // no more frequent similar haplotype exists
+
+ // Update frequency and sort the two modified elements
+ count = freq[i].count;
+ freq[i].count = 0;
+ rcns->hap_freq[freq[i].haplotype] = 0;
+ rcns->hap_freq[max_hap] += count;
+ // Move the now empty element behind all following non-empty ones
+ for (j=i+1; j<NHAP; j++)
+ {
+ if ( !freq[j].count ) break;
+ ii_t tmp = freq[j-1]; freq[j-1] = freq[j]; freq[j] = tmp;
+ }
+ // Walk towards the top, add the reads to the matching haplotype when it is encountered
+ // and swap neighbours that are out of order so that the increased element moves up
+ for (j=i-1; j>=0; j--)
+ {
+ if ( freq[j].haplotype==max_hap ) freq[j].count += count; // update the best matching haplotype
+ if ( freq[j].count < freq[j+1].count )
+ {
+ ii_t tmp = freq[j]; freq[j] = freq[j+1]; freq[j+1] = tmp;
+ }
+ }
+ }
+
+ // Use only one consensus if the next best haplotype is populated by less than 10% of reads
+ rcns->ncns = ((float)freq[1].count / (freq[0].count + freq[1].count) < 0.1) ? 1 : 2;
+
+ // Remove unused candidate variants from the top two haplotypes
+ int hap0 = freq[0].haplotype;
+ int hap1 = rcns->ncns==2 ? freq[1].haplotype : 0;
+ rcns->cns_hap[0] = 0;
+ rcns->cns_hap[1] = 0;
+ // i: the old variant index, j: the new index after dropping the unused variants
+ for (i=0,j=0; i<MAX_NCVAR; i++)
+ {
+ if ( !((hap0|hap1) & (1U<<i)) ) continue; // unused candidate variant, skip
+ if ( i!=j ) rcns->cvar[j] = rcns->cvar[i];
+ if ( hap0 & (1U<<i) ) rcns->cns_hap[0] |= 1U<<j;
+ if ( hap1 & (1U<<i) ) rcns->cns_hap[1] |= 1U<<j;
+ j++;
+ }
+ rcns->ncvar = j;
+
+#if DEBUG_RCNS
+ // This only matters for debugging print
+ memset(rcns->hap_freq,0,NHAP*sizeof(*rcns->hap_freq));
+ rcns->hap_freq[rcns->cns_hap[1]] = freq[1].count; // NB: the order matters when ncns==1
+ rcns->hap_freq[rcns->cns_hap[0]] = freq[0].count;
+#endif
+
+ return 0;
+}
+
+
+// Append the most frequent insertion seen after the j-th window position to the
+// consensus `cns`. Nothing is added at the tested position (rcns->pos) or when an
+// insertion at this position is registered as a candidate variant (scanned from ivar).
+// Reads with an insertion usually, but not always, increment also the base counts at
+// this position, therefore the frequency comparison is only approximate.
+static inline void apply_consensus_insertion(read_cns_t *rcns, cns_seq_t *cns, int j, int ivar)
+{
+    const hts_pos_t ref_pos = rcns->beg + j;
+
+    // Insertions at the current position are being tested by bam2bcf_iaux, do not apply them
+    if ( ref_pos == rcns->pos ) return;
+
+    // An insertion registered as a variant at this position is handled by the caller
+    for (; ivar < rcns->ncvar && rcns->cvar[ivar].pos == ref_pos; ivar++)
+        if ( rcns->cvar[ivar].vtype == ins ) return;
+
+    const base_freq_t *bases = &rcns->base_freq[j];
+    const ins_freq_t *insertions = &rcns->ins_freq[j];
+    int i;
+
+    // Reads with a base (A,C,G,T,N) at this position; the deletion counter is not included
+    int nreads = 0;
+    for (i=0; i<BF_DEL; i++) nreads += bases->base[i];
+
+    // The most frequent insertion, the first one wins a tie
+    int best = 0, best_freq = 0;
+    for (i=0; i<NI && insertions->len[i]; i++)
+    {
+        if ( insertions->freq[i] <= best_freq ) continue;
+        best_freq = insertions->freq[i];
+        best = i;
+    }
+
+    // Apply the insertion only when it is supported by at least half of the reads
+    if ( nreads > 2*best_freq ) return;
+
+    const char *ins_seq = insertions->nt16_seq[best];
+    const int ins_len = insertions->len[best];
+    for (i=0; i<ins_len; i++)
+        cns->seq[cns->nseq++] = seq_nt16_int[(int)ins_seq[i]];
+}
+
+// For each position of the realignment window apply either the candidate variants
+// from ith haplotype or decide on the base/ins/del by majority vote.
+//
+// rcns .. supplies the frequency arrays, the candidate variants (cvar) and the selected haplotypes (cns_hap)
+// ref .. reference sequence; ref[j] is read as the reference base of the window position rcns->beg+j
+// ith .. index of the consensus to build; bit i of rcns->cns_hap[ith] tells if cvar[i] is applied
+//
+// The sequence is appended to rcns->cns[ith].seq starting at the current cns->nseq (nseq is not
+// reset here) and cns->ipos is set to the sequence index that corresponds to the tested position.
+static void create_consensus(read_cns_t *rcns, const char *ref, int ith)
+{
+ int n = rcns->end - rcns->beg + 1; // window length
+ cns_seq_t *cns = &rcns->cns[ith];
+ base_freq_t *bfreq = rcns->base_freq;
+ ins_freq_t *ifreq = rcns->ins_freq;
+ del_freq_t *dfreq = rcns->del_freq;
+ hts_pos_t prev_pos = 0; // ref position of the last base appended by this function
+ int j,k, ivar = 0; // ivar: the first candidate variant that has not been passed yet
+ for (j=0; j<n; j++)
+ {
+ hts_pos_t ref_pos = rcns->beg + j;
+ if ( rcns->pos == ref_pos ) cns->ipos = cns->nseq; // remember where the tested position starts
+
+ // Skip candidate variants located before this position
+ while ( ivar < rcns->ncvar && rcns->cvar[ivar].pos < ref_pos ) ivar++;
+
+ if ( ivar >= rcns->ncvar || rcns->cvar[ivar].pos != ref_pos )
+ {
+ // This position is not recognised as a het variant so take the most frequent base, including
+ // a deletion if that is most frequent. However, for deleted bases make sure they are not part
+ // of the deletion that is being tested at this positions
+ // (the reference base is the fallback when nothing was observed)
+ int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]];
+ // BF_DEL takes part in the vote only outside of the interval [pos,pos+max_del]
+ int nk = ( ref_pos < rcns->pos || ref_pos > rcns->pos + rcns->max_del ) ? BF_DEL+1 : BF_DEL;
+ for (k=0; k<nk; k++)
+ if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k;
+
+ if ( kmax!=BF_DEL ) // the most frequent base can be a deletion
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = kmax;
+ }
+ // Only apply consensus insertions that are not being tested by bam2bcf_iaux, i.e. not at the current pos
+ apply_consensus_insertion(rcns, cns, j, ivar);
+ continue;
+ }
+ int which = rcns->cvar[ivar].which;
+ if ( !(rcns->cns_hap[ith] & (1U<<ivar)) )
+ {
+ // This position has a heterozygous variant but not in this haplotype. Take the
+ // most frequent base different from the ivar-th variant
+ int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]];
+ for (k=0; k<6; k++)
+ {
+ if ( rcns->cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue;
+ if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k;
+ }
+ // Append the base unless it is a deletion or a base for this position was added already
+ if ( kmax!=BF_DEL && (!cns->nseq || prev_pos != ref_pos) )
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = kmax;
+ }
+ apply_consensus_insertion(rcns, cns, j, ivar);
+ continue;
+ }
+ if ( rcns->cvar[ivar].vtype == snv )
+ {
+ // The SNV is part of this haplotype, use its base directly
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = which;
+ apply_consensus_insertion(rcns, cns, j, ivar);
+ continue;
+ }
+
+ // There can be multiple variants at this position, for example snv+ins. SNVs come first
+ // thanks to cvar_pos_cmp(), make sure the base has not been added already.
+ if ( !cns->nseq || prev_pos != ref_pos )
+ {
+ int max_freq = 0, kmax = seq_nt16_int[seq_nt16_table[(int)ref[j]]];
+ for (k=0; k<6; k++)
+ {
+ // NB: SNVs were handled above, so the vtype==snv test cannot be true here
+ if ( rcns->cvar[ivar].vtype==snv && rcns->cvar[ivar].which==k ) continue;
+ if ( max_freq < bfreq[j].base[k] ) max_freq = bfreq[j].base[k], kmax = k;
+ }
+ if ( kmax!=BF_DEL )
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = kmax;
+ }
+ }
+ if ( rcns->cvar[ivar].vtype == ins )
+ {
+ // Append the inserted sequence after the anchor base
+ int len = ifreq[j].len[which];
+ char *seq = ifreq[j].nt16_seq[which];
+ for (k=0; k<len; k++)
+ {
+ prev_pos = ref_pos;
+ cns->seq[cns->nseq++] = seq_nt16_int[(int)seq[k]];
+ }
+ }
+ // Jump over the deleted reference bases, the loop increment then moves to the first base after
+ else if ( rcns->cvar[ivar].vtype == del ) j += dfreq[j].len[which];
+ }
+}
+
+// Build the consensus sequence(s) from the reads registered via rcns_set_reads().
+// The steps:
+//  1. Identify heterozygous candidate variants (select_candidate_variants)
+//  2. Count how many reads support each combination of the candidates, i.e. each
+//     haplotype (create_haplotype_frequency_spectrum)
+//  3. Merge likely erroneous haplotypes and keep the top one or two (correct_haplotype_errors)
+//  4. Create one consensus sequence per selected haplotype (create_consensus)
+// Without candidate variants a single majority-vote consensus is created.
+//
+// Returns the rcns->cns array; rcns->ncns sequences are filled.
+// NOTE(review): the return value of select_candidate_variants() is not checked here.
+cns_seq_t *rcns_get_consensus(read_cns_t *rcns, const char *ref)
+{
+    int ith;
+
+    debug_print_base_freqs(rcns, ref);
+    select_candidate_variants(rcns, ref);
+    debug_print_candidate_variants(rcns);
+
+    if ( !rcns->ncvar )
+    {
+        // nothing to phase, one consensus with no candidate variants applied
+        rcns->ncns = 1;
+        rcns->cns_hap[0] = 0;
+    }
+    else
+    {
+        create_haplotype_frequency_spectrum(rcns);
+        debug_print_haplotype_frequency_spectrum(rcns);
+
+        correct_haplotype_errors(rcns);
+        debug_print_candidate_variants(rcns);
+        debug_print_haplotype_frequency_spectrum(rcns);
+    }
+
+    ith = 0;
+    while ( ith < rcns->ncns )
+    {
+        create_consensus(rcns,ref,ith);
+        ith++;
+    }
+    debug_print_consensus(rcns,ref);
+
+    return rcns->cns;
+}
--- /dev/null
+/* read_consensus.h -- create and maintain consensus of reads
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: pd3@sanger
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE. */
+
+#ifndef READ_CONSENSUS_H
+#define READ_CONSENSUS_H
+
+#include <stdint.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+
+#ifndef DEBUG_RCNS
+#define DEBUG_RCNS 0
+#endif
+
+// One consensus sequence
+typedef struct
+{
+    char *seq;      // nt5 sequence: "ACGTN"[(int)seq[i]]
+    int nseq;       // the sequence length
+    int ipos;       // the `pos` index relative to seq
+}
+cns_seq_t;
+
+// Opaque handle, the structure is defined in read_consensus.c
+typedef struct _read_cns_t read_cns_t;
+
+// Init and destroy read consensus. `pos` is the tested position, `beg` and `end`
+// the window boundaries. rcns_init() returns NULL if the frequency arrays could
+// not be allocated; rcns_destroy() accepts NULL.
+read_cns_t *rcns_init(hts_pos_t pos, hts_pos_t beg, hts_pos_t end);
+void rcns_destroy(read_cns_t *rcns);
+
+// Reset the structures for new sample and/or position.
+// Returns 0 on success, -1 on memory allocation failure.
+int rcns_reset(read_cns_t *rcns, hts_pos_t pos, hts_pos_t beg, hts_pos_t end);
+
+// Add reads to consensus. The provided structures must continue to exist
+// until rcns_get_consensus() is called. Returns 0.
+//
+// Todo (easy): allow it to be called once or multiple times, eg for
+// creating a shared consensus for multiple samples
+int rcns_set_reads(read_cns_t *rcns, bam_pileup1_t *plp, int nplp);
+
+// Generate up to two consensus sequences, cns_seq[1].nseq is 0 when only
+// the first is set. The returned array is owned by rcns.
+cns_seq_t *rcns_get_consensus(read_cns_t *rcns, const char *ref);
+
+#endif
-/*
+/*
Copyright (C) 2016-2021 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#define SMPL_SINGLE 2 // single sample expected
#define SMPL_PAIR1 4 // two samples expected, the first is from the bcf hdr
#define SMPL_PAIR2 8 // two samples expected, the second is from the bcf hdr
-#define SMPL_VERBOSE 16 // print warnings
+#define SMPL_VERBOSE 16 // print warnings
#define SMPL_REORDER 32 // reorder samples as asked, sample_list[i] points to the VCF header index
typedef struct
}
smpl_ilist_t;
+// Pass NULL for sample_list to get all samples
smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags);
smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags);
void smpl_ilist_destroy(smpl_ilist_t *smpl);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
- int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file
+ int flags = !src ? SMPL_STRICT|SMPL_SINGLE|SMPL_REORDER : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is tab vs vcf annotation file
smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src
if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names);
args->nsmpl_annot = ilist->n;
case 'H': args->header_lines = dbuf_push(args->header_lines,strdup(optarg)); break;
case 1 : args->rename_chrs = optarg; break;
case 2 :
+ if ( args->pair_logic==-1 ) args->pair_logic = 0;
if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF;
else if ( !strcmp(optarg,"indels") ) args->pair_logic |= BCF_SR_PAIR_INDEL_REF;
else if ( !strcmp(optarg,"both") ) args->pair_logic |= BCF_SR_PAIR_BOTH_REF;
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
- int flags = !src ? SMPL_STRICT|SMPL_SINGLE : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is vcf vs tab annotation file
+ int flags = !src ? SMPL_STRICT|SMPL_SINGLE|SMPL_REORDER : SMPL_STRICT|SMPL_SINGLE|SMPL_PAIR2; // is tab vs vcf annotation file
smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, flags); // gives mapping dst->src
if ( !ilist || !ilist->n ) error("Could not parse the samples: %s\n", args->sample_names);
args->nsmpl_annot = ilist->n;
case 'H': args->header_lines = dbuf_push(args->header_lines,strdup(optarg)); break;
case 1 : args->rename_chrs = optarg; break;
case 2 :
+ if ( args->pair_logic==-1 ) args->pair_logic = 0;
if ( !strcmp(optarg,"snps") ) args->pair_logic |= BCF_SR_PAIR_SNP_REF;
else if ( !strcmp(optarg,"indels") ) args->pair_logic |= BCF_SR_PAIR_INDEL_REF;
else if ( !strcmp(optarg,"both") ) args->pair_logic |= BCF_SR_PAIR_BOTH_REF;
memset(&rec_tgt,0,sizeof(rec_tgt));
regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr);
regitr_t *tmp_itr = regitr_init(args->tgt_idx);
- regitr_copy(tmp_itr, args->tgt_itr);
for (i=0; i<n; i++)
{
+ regitr_copy(tmp_itr, args->tgt_itr);
rec = vcfbuf_peek(args->vcfbuf, i);
int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1;
while ( regitr_overlap(tmp_itr) )
memset(&rec_tgt,0,sizeof(rec_tgt));
regidx_overlap(args->tgt_idx, bcf_seqname(args->aux.hdr,rec0),rec0->pos,rec0->pos,args->tgt_itr);
regitr_t *tmp_itr = regitr_init(args->tgt_idx);
- regitr_copy(tmp_itr, args->tgt_itr);
for (i=0; i<n; i++)
{
+ regitr_copy(tmp_itr, args->tgt_itr);
rec = vcfbuf_peek(args->vcfbuf, i);
int rec_indel = is_indel(rec->n_allele, rec->d.allele) ? 1 : -1;
while ( regitr_overlap(tmp_itr) )
if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
break;
case 'v':
- args->verbose = strtol(optarg, 0, 0);
- error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
+ args->verbose = strtol(optarg, &tmp, 0);
+ if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
break;
case 'h':
case '?': usage(args); break;
if ( args->regions_overlap < 0 ) error("Could not parse: --regions-overlap %s\n",optarg);
break;
case 'v':
- args->verbose = strtol(optarg, 0, 0);
- error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
+ args->verbose = strtol(optarg, &tmp, 0);
+ if ( *tmp || args->verbose<0 || args->verbose>1 ) error("Error: currently only --verbose 0 or --verbose 1 is supported\n");
break;
case 'h':
case '?': usage(args); break;
else if ( var_len==BCF_VL_G )
{
args->maux->nagr_map = bcf_alleles2gt(line->n_allele-1,line->n_allele-1)+1;
- assert( ret==line->n_allele || ret==args->maux->nagr_map );
+ if ( ret!=line->n_allele && ret!=args->maux->nagr_map ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1);
if ( ret==line->n_allele ) // haploid
{
args->maux->nagr_map = line->n_allele;
int k = 0;
for (i=0; i<ma->nals; i++)
if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]);
- assert( k==ma->nout_als );
+ if ( k!=ma->nout_als ) error("Error: could not merge alleles at %s:%"PRId64", sanity check failed: %d!=%d\n",bcf_seqname(out_hdr,out),out->pos+1,k,ma->nout_als);
normalize_alleles(ma->out_als, ma->nout_als);
bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als);
free(al_idxs);
else if ( var_len==BCF_VL_G )
{
args->maux->nagr_map = bcf_alleles2gt(line->n_allele-1,line->n_allele-1)+1;
- assert( ret==line->n_allele || ret==args->maux->nagr_map );
+ if ( ret!=line->n_allele && ret!=args->maux->nagr_map ) error("Wrong number of %s fields at %s:%"PRId64"\n",rule->hdr_tag,bcf_seqname(hdr,line),(int64_t) line->pos+1);
if ( ret==line->n_allele ) // haploid
{
args->maux->nagr_map = line->n_allele;
int k = 0;
for (i=0; i<ma->nals; i++)
if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]);
- assert( k==ma->nout_als );
+ if ( k!=ma->nout_als ) error("Error: could not merge alleles at %s:%"PRId64", sanity check failed: %d!=%d\n",bcf_seqname(out_hdr,out),out->pos+1,k,ma->nout_als);
normalize_alleles(ma->out_als, ma->nout_als);
bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als);
free(al_idxs);
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int record_cmd_line, force, force_warned, keep_sum_ad;
abuf_t *abuf;
abuf_opt_t atomize;
- int use_star_allele;
+ int use_star_allele, ma_use_ref_allele;
char *old_rec_tag;
htsFile *out;
}
for (j=0; j<ngts; j++)
{
if ( gt[j]==bcf_int32_vector_end ) break;
- if ( bcf_gt_is_missing(gt[j]) || bcf_gt_allele(gt[j])==0 ) continue; // missing allele or ref: leave as is
+ if ( bcf_gt_is_missing(gt[j]) ) continue; // missing allele: leave as is
+ if ( (ialt==0 || args->ma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is
if ( bcf_gt_allele(gt[j])==ialt+1 )
gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT
- else
+ else if ( args->ma_use_ref_allele )
gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF
+ else
+ gt[j] = bcf_gt_missing | bcf_gt_is_phased(gt[j]); // set to missing
}
gt += ngts;
}
}
static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
{
- #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end,set_missing) \
{ \
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
int ntmp = args->ntmp_arr1 / sizeof(type_t); \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
{ \
- dst_vals[0] = src_vals[ialt]; \
+ int idst = 0; \
+ int isrc = ialt; \
+ if ( is_missing || is_vector_end ) set_missing; \
+ else dst_vals[idst] = src_vals[isrc]; \
dst_vals += 1; \
src_vals += nvals; \
} \
{ \
dst_vals[0] = src_vals[0]; \
for (j=1; j<nvals; j++) \
+ { \
+ int isrc = j; \
if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) dst_vals[0] += src_vals[j]; \
- dst_vals[1] = src_vals[ialt+1]; \
+ } \
+ int isrc = ialt + 1; \
+ int idst = 1; \
+ if ( is_vector_end ) set_missing; \
+ else dst_vals[idst] = src_vals[isrc]; \
dst_vals += 2; \
src_vals += nvals; \
} \
for (i=0; i<nsmpl; i++) \
{ \
dst_vals[0] = src_vals[0]; \
- dst_vals[1] = src_vals[ialt+1]; \
+ int isrc = ialt + 1; \
+ int idst = 1; \
+ if ( is_vector_end ) set_missing; \
+ else dst_vals[idst] = src_vals[isrc]; \
dst_vals += 2; \
src_vals += nvals; \
} \
if ( !haploid ) \
{ \
int j; \
- for (j=0; j<nvals; j++) if ( is_vector_end ) break; \
+ for (j=0; j<nvals; j++) \
+ { \
+ int isrc = j; \
+ if ( is_vector_end ) break; \
+ } \
if ( j!=nvals ) haploid = 1; \
} \
dst_vals[0] = src_vals[0]; \
if ( haploid ) \
{ \
dst_vals[1] = src_vals[ialt+1]; \
- if ( !all_haploid ) set_vector_end; \
+ if ( !all_haploid ) { int idst = 2; set_vector_end; } \
} \
else \
{ \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
{
- case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break;
- case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[isrc]==bcf_int32_vector_end, src_vals[isrc]==bcf_int32_missing, dst_vals[idst]=bcf_int32_vector_end, dst_vals[idst]=bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[isrc]), bcf_float_is_missing(src_vals[isrc]), bcf_float_set_vector_end(dst_vals[idst]), bcf_float_set_missing(dst_vals[idst])); break; // set_missing must write to dst_vals, mirroring the int32 case; src_vals advances at a different stride
}
#undef BRANCH_NUMERIC
}
fprintf(stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
fprintf(stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
fprintf(stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
fprintf(stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n");
fprintf(stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n");
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
args->do_indels = 1;
+ args->ma_use_ref_allele = 1;
args->clevel = -1;
int region_is_file = 0;
int targets_is_file = 0;
{"fasta-ref",required_argument,NULL,'f'},
{"do-not-normalize",no_argument,NULL,'N'},
{"multiallelics",required_argument,NULL,'m'},
+ {"multi-overlaps",required_argument,NULL,13},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"regions-overlap",required_argument,NULL,1},
else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n");
break;
case 12 : args->old_rec_tag = optarg; break;
+ case 13 :
+ if ( optarg[0]=='0' ) args->ma_use_ref_allele = 1;
+ else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
+ else error("Invalid argument to --multi-overlaps\n");
+ break;
case 'N': args->do_indels = 0; break;
case 'd':
if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
/* vcfnorm.c -- Left-align and normalize indels.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
int record_cmd_line, force, force_warned, keep_sum_ad;
abuf_t *abuf;
abuf_opt_t atomize;
- int use_star_allele;
+ int use_star_allele, ma_use_ref_allele;
char *old_rec_tag;
htsFile *out;
}
for (j=0; j<ngts; j++)
{
if ( gt[j]==bcf_int32_vector_end ) break;
- if ( bcf_gt_is_missing(gt[j]) || bcf_gt_allele(gt[j])==0 ) continue; // missing allele or ref: leave as is
+ if ( bcf_gt_is_missing(gt[j]) ) continue; // missing allele: leave as is
+ if ( (ialt==0 || args->ma_use_ref_allele) && bcf_gt_allele(gt[j])==0 ) continue; // ref && `--multi-overlaps 0`: leave as is
if ( bcf_gt_allele(gt[j])==ialt+1 )
gt[j] = bcf_gt_unphased(1) | bcf_gt_is_phased(gt[j]); // set to first ALT
- else
+ else if ( args->ma_use_ref_allele )
gt[j] = bcf_gt_unphased(0) | bcf_gt_is_phased(gt[j]); // set to REF
+ else
+ gt[j] = bcf_gt_missing | bcf_gt_is_phased(gt[j]); // set to missing
}
gt += ngts;
}
}
static void split_format_numeric(args_t *args, bcf1_t *src, bcf_fmt_t *fmt, int ialt, bcf1_t *dst)
{
- #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end) \
+ #define BRANCH_NUMERIC(type,type_t,is_vector_end,is_missing,set_vector_end,set_missing) \
{ \
const char *tag = bcf_hdr_int2id(args->hdr,BCF_DT_ID,fmt->id); \
int ntmp = args->ntmp_arr1 / sizeof(type_t); \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
{ \
- dst_vals[0] = src_vals[ialt]; \
+ int idst = 0; \
+ int isrc = ialt; \
+ if ( is_missing || is_vector_end ) set_missing; \
+ else dst_vals[idst] = src_vals[isrc]; \
dst_vals += 1; \
src_vals += nvals; \
} \
{ \
dst_vals[0] = src_vals[0]; \
for (j=1; j<nvals; j++) \
+ { \
+ int isrc = j; \
if ( j!=ialt+1 && !(is_missing) && !(is_vector_end) ) dst_vals[0] += src_vals[j]; \
- dst_vals[1] = src_vals[ialt+1]; \
+ } \
+ int isrc = ialt + 1; \
+ int idst = 1; \
+ if ( is_vector_end ) set_missing; \
+ else dst_vals[idst] = src_vals[isrc]; \
dst_vals += 2; \
src_vals += nvals; \
} \
for (i=0; i<nsmpl; i++) \
{ \
dst_vals[0] = src_vals[0]; \
- dst_vals[1] = src_vals[ialt+1]; \
+ int isrc = ialt + 1; \
+ int idst = 1; \
+ if ( is_vector_end ) set_missing; \
+ else dst_vals[idst] = src_vals[isrc]; \
dst_vals += 2; \
src_vals += nvals; \
} \
if ( !haploid ) \
{ \
int j; \
- for (j=0; j<nvals; j++) if ( is_vector_end ) break; \
+ for (j=0; j<nvals; j++) \
+ { \
+ int isrc = j; \
+ if ( is_vector_end ) break; \
+ } \
if ( j!=nvals ) haploid = 1; \
} \
dst_vals[0] = src_vals[0]; \
if ( haploid ) \
{ \
dst_vals[1] = src_vals[ialt+1]; \
- if ( !all_haploid ) set_vector_end; \
+ if ( !all_haploid ) { int idst = 2; set_vector_end; } \
} \
else \
{ \
}
switch (bcf_hdr_id2type(args->hdr,BCF_HL_FMT,fmt->id))
{
- case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[j]==bcf_int32_vector_end, src_vals[j]==bcf_int32_missing, dst_vals[2]=bcf_int32_vector_end); break;
- case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[j]), bcf_float_is_missing(src_vals[j]), bcf_float_set_vector_end(dst_vals[2])); break;
+ case BCF_HT_INT: BRANCH_NUMERIC(int32, int32_t, src_vals[isrc]==bcf_int32_vector_end, src_vals[isrc]==bcf_int32_missing, dst_vals[idst]=bcf_int32_vector_end, dst_vals[idst]=bcf_int32_missing); break;
+ case BCF_HT_REAL: BRANCH_NUMERIC(float, float, bcf_float_is_vector_end(src_vals[isrc]), bcf_float_is_missing(src_vals[isrc]), bcf_float_set_vector_end(dst_vals[idst]), bcf_float_set_missing(dst_vals[idst])); break; // set_missing must write to dst_vals, mirroring the int32 case; src_vals advances at a different stride
}
#undef BRANCH_NUMERIC
}
fprintf(bcftools_stderr, " --force Try to proceed even if malformed tags are encountered. Experimental, use at your own risk\n");
fprintf(bcftools_stderr, " --keep-sum TAG,.. Keep vector sum constant when splitting multiallelics (see github issue #360)\n");
fprintf(bcftools_stderr, " -m, --multiallelics -|+TYPE Split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
+ fprintf(bcftools_stderr, " --multi-overlaps 0|. Fill in the reference (0) or missing (.) allele when splitting multiallelics [0]\n");
fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -N, --do-not-normalize Do not normalize indels (with -m or -c s)\n");
fprintf(bcftools_stderr, " --old-rec-tag STR Annotate modified records with INFO/STR indicating the original variant\n");
args->buf_win = 1000;
args->mrows_collapse = COLLAPSE_BOTH;
args->do_indels = 1;
+ args->ma_use_ref_allele = 1;
args->clevel = -1;
int region_is_file = 0;
int targets_is_file = 0;
{"fasta-ref",required_argument,NULL,'f'},
{"do-not-normalize",no_argument,NULL,'N'},
{"multiallelics",required_argument,NULL,'m'},
+ {"multi-overlaps",required_argument,NULL,13},
{"regions",required_argument,NULL,'r'},
{"regions-file",required_argument,NULL,'R'},
{"regions-overlap",required_argument,NULL,1},
else error("Invalid argument to --atom-overlaps. Perhaps you wanted: \"--atom-overlaps '*'\"?\n");
break;
case 12 : args->old_rec_tag = optarg; break;
+ case 13 :
+ if ( optarg[0]=='0' ) args->ma_use_ref_allele = 1;
+ else if ( optarg[0]=='.' ) args->ma_use_ref_allele = 0;
+ else error("Invalid argument to --multi-overlaps\n");
+ break;
case 'N': args->do_indels = 0; break;
case 'd':
if ( !strcmp("snps",optarg) ) args->rmdup = BCF_SR_PAIR_SNPS;
}
int i,max_convert_unpack = convert_max_unpack(args->convert);
+ int max_filter_unpack = args->filter ? filter_max_unpack(args->filter) : 0;
while ( bcf_sr_next_line(args->files) )
{
if ( !bcf_sr_has_line(args->files,0) ) continue;
if ( pass )
{
if ( !args->smpl_pass ) continue;
- if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
+ if ( !(max_convert_unpack & BCF_UN_FMT) && !(max_filter_unpack & BCF_UN_FMT) ) continue;
pass = 0;
for (i=0; i<line->n_sample; i++)
case 'f': args->format_str = strdup(optarg); break;
case 'H': args->print_header = 1; break;
case 'v': args->vcf_list = optarg; break;
- case 'c':
+ case 'c':
error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n");
break;
case 'a':
}
int i,max_convert_unpack = convert_max_unpack(args->convert);
+ int max_filter_unpack = args->filter ? filter_max_unpack(args->filter) : 0;
while ( bcf_sr_next_line(args->files) )
{
if ( !bcf_sr_has_line(args->files,0) ) continue;
if ( pass )
{
if ( !args->smpl_pass ) continue;
- if ( !(max_convert_unpack & BCF_UN_FMT) ) continue;
+ if ( !(max_convert_unpack & BCF_UN_FMT) && !(max_filter_unpack & BCF_UN_FMT) ) continue;
pass = 0;
for (i=0; i<line->n_sample; i++)
case 'f': args->format_str = strdup(optarg); break;
case 'H': args->print_header = 1; break;
case 'v': args->vcf_list = optarg; break;
- case 'c':
+ case 'c':
error("The --collapse option is obsolete, pipe through `bcftools norm -c` instead.\n");
break;
case 'a':
/* vcfsort.c -- sort subcommand
- Copyright (C) 2017-2021 Genome Research Ltd.
+ Copyright (C) 2017-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
unlink(blk->fname);
free(blk->fname);
}
- if ( blk->rec )
+ if ( blk->rec )
bcf_destroy(blk->rec);
}
rmdir(args->tmp_dir);
int i;
for (i=0; i<a->n_allele; i++)
- {
+ {
if ( i >= b->n_allele ) return 1;
int ret = strcasecmp(a->d.allele[i],b->d.allele[i]);
if ( ret ) return ret;
args->nblk++;
args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
+ if ( !args->blk ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",sizeof(blk_t)*args->nblk);
blk_t *blk = args->blk + args->nblk - 1;
kstring_t str = {0,0,0};
htsFile *fh = hts_open(blk->fname, "wbu");
if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno));
if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
-
+
int i;
for (i=0; i<args->nbuf; i++)
{
bcf_destroy(rec);
}
-void sort_blocks(args_t *args)
+void sort_blocks(args_t *args)
{
htsFile *in = hts_open(args->fname, "r");
if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname);
khp_insert(blk, bhp, &blk);
}
-void merge_blocks(args_t *args)
+void merge_blocks(args_t *args)
{
fprintf(stderr,"Merging %d temporary files\n", (int)args->nblk);
khp_blk_t *bhp = khp_init(blk);
exit(1);
}
-size_t parse_mem_string(const char *str)
+size_t parse_mem_string(const char *str)
{
char *tmp;
double mem = strtod(str, &tmp);
{
args->max_mem *= 0.9;
args->mem_block = malloc(args->max_mem);
+ if ( !args->mem_block ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",args->max_mem);
args->mem = 0;
args->tmp_dir = init_tmp_prefix(args->tmp_dir);
/* vcfsort.c -- sort subcommand
- Copyright (C) 2017-2021 Genome Research Ltd.
+ Copyright (C) 2017-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
unlink(blk->fname);
free(blk->fname);
}
- if ( blk->rec )
+ if ( blk->rec )
bcf_destroy(blk->rec);
}
rmdir(args->tmp_dir);
int i;
for (i=0; i<a->n_allele; i++)
- {
+ {
if ( i >= b->n_allele ) return 1;
int ret = strcasecmp(a->d.allele[i],b->d.allele[i]);
if ( ret ) return ret;
args->nblk++;
args->blk = (blk_t*) realloc(args->blk, sizeof(blk_t)*args->nblk);
+ if ( !args->blk ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",sizeof(blk_t)*args->nblk);
blk_t *blk = args->blk + args->nblk - 1;
kstring_t str = {0,0,0};
htsFile *fh = hts_open(blk->fname, "wbu");
if ( fh == NULL ) clean_files_and_throw(args, "Cannot write %s: %s\n", blk->fname, strerror(errno));
if ( bcf_hdr_write(fh, args->hdr)!=0 ) clean_files_and_throw(args, "[%s] Error: cannot write to %s\n", __func__,blk->fname);
-
+
int i;
for (i=0; i<args->nbuf; i++)
{
bcf_destroy(rec);
}
-void sort_blocks(args_t *args)
+void sort_blocks(args_t *args)
{
htsFile *in = hts_open(args->fname, "r");
if ( !in ) clean_files_and_throw(args, "Could not read %s\n", args->fname);
khp_insert(blk, bhp, &blk);
}
-void merge_blocks(args_t *args)
+void merge_blocks(args_t *args)
{
fprintf(bcftools_stderr,"Merging %d temporary files\n", (int)args->nblk);
khp_blk_t *bhp = khp_init(blk);
bcftools_exit(1);
}
-size_t parse_mem_string(const char *str)
+size_t parse_mem_string(const char *str)
{
char *tmp;
double mem = strtod(str, &tmp);
{
args->max_mem *= 0.9;
args->mem_block = malloc(args->max_mem);
+ if ( !args->mem_block ) error("Error: could not allocate %zu bytes of memory, try reducing --max-mem\n",args->max_mem);
args->mem = 0;
args->tmp_dir = init_tmp_prefix(args->tmp_dir);
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2022 Genome Research Ltd.
+ Copyright (C) 2012-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
if ( args->split_by_id ) error("Only one file can be given with -i.\n");
}
if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO;
+ else args->files->max_unpack = BCF_UN_FMT;
if ( args->targets_list )
{
bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2022 Genome Research Ltd.
+ Copyright (C) 2012-2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
if ( args->split_by_id ) error("Only one file can be given with -i.\n");
}
if ( !args->samples_list ) args->files->max_unpack = BCF_UN_INFO;
+ else args->files->max_unpack = BCF_UN_FMT;
if ( args->targets_list )
{
bcf_sr_set_opt(args->files,BCF_SR_TARGETS_OVERLAP,targets_overlap);
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
fprintf(stderr, "Subset options:\n");
fprintf(stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
fprintf(stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
- fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n");
+ fprintf(stderr, " when combining filtering with sample subsetting as filtering comes (usually) first.\n");
+ fprintf(stderr, " If unsure, split sample subsetting and filtering in two commands, using -Ou when piping.\n");
fprintf(stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n");
fprintf(stderr, " --force-samples Only warn about unknown subset samples\n");
fprintf(stderr, "\n");
case 'l':
args->clevel = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
- args->output_type |= FT_GZ;
+ args->output_type |= FT_GZ;
break;
case 'o': args->fn_out = optarg; break;
case 'H': args->print_header = 0; break;
args->min_alleles = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
break;
- case 'M':
+ case 'M':
args->max_alleles = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
break;
/* vcfview.c -- VCF/BCF conversion, view, subset and filter VCF/BCF files.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Shane McCarthy <sm15@sanger.ac.uk>
fprintf(bcftools_stderr, "Subset options:\n");
fprintf(bcftools_stderr, " -a, --trim-alt-alleles Trim ALT alleles not seen in the genotype fields (or their subset with -s/-S)\n");
fprintf(bcftools_stderr, " -I, --no-update Do not (re)calculate INFO fields for the subset (currently INFO/AC and INFO/AN)\n");
- fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix)\n");
+ fprintf(bcftools_stderr, " -s, --samples [^]LIST Comma separated list of samples to include (or exclude with \"^\" prefix). Be careful\n");
+ fprintf(bcftools_stderr, " when combining filtering with sample subsetting as filtering comes (usually) first.\n");
+ fprintf(bcftools_stderr, " If unsure, split sample subsetting and filtering in two commands, using -Ou when piping.\n");
fprintf(bcftools_stderr, " -S, --samples-file [^]FILE File of samples to include (or exclude with \"^\" prefix)\n");
fprintf(bcftools_stderr, " --force-samples Only warn about unknown subset samples\n");
fprintf(bcftools_stderr, "\n");
case 'l':
args->clevel = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg);
- args->output_type |= FT_GZ;
+ args->output_type |= FT_GZ;
break;
case 'o': args->fn_out = optarg; break;
case 'H': args->print_header = 0; break;
args->min_alleles = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg);
break;
- case 'M':
+ case 'M':
args->max_alleles = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg);
break;
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.16
+VERSION=1.17
# If we have a git clone, then check against the current tag
if [ -e .git ]
from setuptools.command.build_ext import build_ext
from distutils.extension import Extension
-from distutils.sysconfig import get_config_vars, get_python_version
+from distutils.sysconfig import get_config_var, get_config_vars, get_python_version
from pkg_resources import Distribution
# @loader_path. This will allow Python packages to find the library
# in the expected place, while still giving enough flexibility to
# external applications to link against the library.
- relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
+ relative_module_path = ext.name.replace(".", os.sep) + (get_config_var('EXT_SUFFIX') or get_config_var('SO'))
library_path = os.path.join(
"@rpath", os.path.basename(relative_module_path)
)
using cython and a high-level, pythonic API for convenient access to
the data within genomic file formats.
-The current version wraps *htslib-1.16*, *samtools-1.16.1*, and *bcftools-1.16*.
+The current version wraps *htslib-1.17*, *samtools-1.17*, and *bcftools-1.17*.
To install the latest release, type::
from setuptools.command.build_ext import build_ext
from distutils.extension import Extension
-from distutils.sysconfig import get_config_vars, get_python_lib, get_python_version
+from distutils.sysconfig import get_config_var, get_config_vars, get_python_lib, get_python_version
from pkg_resources import Distribution
# @loader_path. This will allow Python packages to find the library
# in the expected place, while still giving enough flexibility to
# external applications to link against the library.
- relative_module_path = ext.name.replace(".", os.sep) + get_config_vars()["SO"]
+ relative_module_path = ext.name.replace(".", os.sep) + (get_config_var('EXT_SUFFIX') or get_config_var('SO'))
library_path = os.path.join(
"@rpath", os.path.basename(relative_module_path)
)
--- /dev/null
+[project]
+name = "pysam"
+description = "pysam - a python module for reading, manipulating and writing genomic data sets."
+license = { text = "MIT License" }
+version = "0.21.0"
+authors = [
+ { name = "Andreas Heger", email = "andreas.heger@gmail.com"}
+]
+requires-python = ">=3.6"
+
+dynamic = [
+ "classifiers",
+ "readme",
+]
+
+dependencies = [
+ "cython",
+]
+
+
+[build-system]
+requires = ["setuptools>=59.0", "wheel", "Cython>=0.29.30,<3.0"]
+build-backend = "setuptools.build_meta:__legacy__"
if pysam.config.HTSLIB == "builtin":
pysam_libs.append('libchtslib')
- so = sysconfig.get_config_var('SO')
+ so = sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO')
return [os.path.join(dirname, x + so) for x in pysam_libs]
+try:
+ from typing import Final
+ HAVE_FINAL = True
+except ImportError:
+ HAVE_FINAL = False
+
from pysam.utils import PysamDispatcher
-BCFTOOLS_DISPATCH = [
+_BCFTOOLS_DISPATCH = [
"index",
"annotate",
"concat",
"roh",
"stats"]
-# instantiate bcftools commands as python functions
-for cmd in BCFTOOLS_DISPATCH:
- globals()[cmd] = PysamDispatcher("bcftools", cmd, None)
+
+def _wrap_command(dispatch: str) -> PysamDispatcher:
+ return PysamDispatcher("bcftools", dispatch, ())
+
+
+if not HAVE_FINAL:
+ # instantiate bcftools commands as python functions
+ for cmd in _BCFTOOLS_DISPATCH:
+ globals()[cmd] = PysamDispatcher("bcftools", cmd, None)
+else:
+ # python >=3.8
+ index: Final[PysamDispatcher] = _wrap_command("index")
+ annotate: Final[PysamDispatcher] = _wrap_command("annotate")
+ concat: Final[PysamDispatcher] = _wrap_command("concat")
+ convert: Final[PysamDispatcher] = _wrap_command("convert")
+ isec: Final[PysamDispatcher] = _wrap_command("isec")
+ merge: Final[PysamDispatcher] = _wrap_command("merge")
+ norm: Final[PysamDispatcher] = _wrap_command("norm")
+ plugin: Final[PysamDispatcher] = _wrap_command("plugin")
+ query: Final[PysamDispatcher] = _wrap_command("query")
+ reheader: Final[PysamDispatcher] = _wrap_command("reheader")
+ sort: Final[PysamDispatcher] = _wrap_command("sort")
+ view: Final[PysamDispatcher] = _wrap_command("view")
+ head: Final[PysamDispatcher] = _wrap_command("head")
+ call: Final[PysamDispatcher] = _wrap_command("call")
+ consensus: Final[PysamDispatcher] = _wrap_command("consensus")
+ cnv: Final[PysamDispatcher] = _wrap_command("cnv")
+ csq: Final[PysamDispatcher] = _wrap_command("csq")
+ filter: Final[PysamDispatcher] = _wrap_command("filter")
+ gtcheck: Final[PysamDispatcher] = _wrap_command("gtcheck")
+ mpileup: Final[PysamDispatcher] = _wrap_command("mpileup")
+ roh: Final[PysamDispatcher] = _wrap_command("roh")
+ stats: Final[PysamDispatcher] = _wrap_command("stats")
+# cython: language_level=3
from pysam.libchtslib cimport *
cdef extern from "htslib_util.h":
+# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
cimport cython
from cpython cimport array as c_array
-from cpython.version cimport PY_MAJOR_VERSION
from cpython cimport PyBytes_FromStringAndSize
from libc.string cimport memset, strchr
from cpython cimport array as c_array
cdef char * htslib_types = 'cCsSiIf'
cdef char * parray_types = 'bBhHiIf'
-cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
-
# translation tables
# cigar code to character and vice versa
cdef char* CODE2CIGAR= "MIDNSHP=XB"
cdef int NCIGAR_CODES = 10
-if IS_PYTHON3:
- CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
- maketrans = str.maketrans
-else:
- CIGAR2CODE = dict([ord(y), x] for x, y in enumerate(CODE2CIGAR))
- maketrans = string.maketrans
-
+CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
# Matches one CIGAR operation: a run length followed by its operation
# character. A raw string is used so that \d is passed to the regex engine
# verbatim instead of being an invalid string escape sequence.
CIGAR_REGEX = re.compile(r"(\d+)([MIDNSHP=XB])")
# names for keys in dictionary representation of an AlignedSegment
cdef inline uint8_t strand_mark_char(uint8_t ch, bam1_t *b):
- if ch == '=':
+ if ch == b'=':
if bam_is_rev(b):
- return ','
+ return b','
else:
- return '.'
+ return b'.'
else:
if bam_is_rev(b):
return tolower(ch)
if isinstance(value, int):
if value < 0:
if value >= INT8_MIN:
- typecode = 'c'
+ typecode = b'c'
elif value >= INT16_MIN:
- typecode = 's'
+ typecode = b's'
elif value >= INT32_MIN:
- typecode = 'i'
+ typecode = b'i'
# unsigned ints
else:
if value <= UINT8_MAX:
- typecode = 'C'
+ typecode = b'C'
elif value <= UINT16_MAX:
- typecode = 'S'
+ typecode = b'S'
elif value <= UINT32_MAX:
- typecode = 'I'
+ typecode = b'I'
elif isinstance(value, float):
- typecode = 'f'
+ typecode = b'f'
elif isinstance(value, str):
- typecode = 'Z'
+ typecode = b'Z'
elif isinstance(value, bytes):
- typecode = 'Z'
+ typecode = b'Z'
elif isinstance(value, array.array) or \
isinstance(value, list) or \
isinstance(value, tuple):
- typecode = 'B'
+ typecode = b'B'
else:
if value_type in 'aAsSIcCZidfH':
typecode = force_bytes(value_type)[0]
t = type(value)
if t is float:
- typecode = 'f'
+ typecode = b'f'
elif t is int:
if max_value is None:
max_value = value
# signed ints
if min_value < 0:
if min_value >= INT8_MIN and max_value <= INT8_MAX:
- typecode = 'c'
+ typecode = b'c'
elif min_value >= INT16_MIN and max_value <= INT16_MAX:
- typecode = 's'
+ typecode = b's'
elif min_value >= INT32_MIN or max_value <= INT32_MAX:
- typecode = 'i'
+ typecode = b'i'
else:
raise ValueError(
"at least one signed integer out of range of "
# unsigned ints
else:
if max_value <= UINT8_MAX:
- typecode = 'C'
+ typecode = b'C'
elif max_value <= UINT16_MAX:
- typecode = 'S'
+ typecode = b'S'
elif max_value <= UINT32_MAX:
- typecode = 'I'
+ typecode = b'I'
else:
raise ValueError(
"at least one integer out of range of BAM/SAM specification")
if t is not bytes:
value = value.encode('ascii')
if len(value) == 1:
- typecode = 'A'
+ typecode = b'A'
else:
- typecode = 'Z'
+ typecode = b'Z'
return typecode
typecode = 0
else:
# only the first character of valuetype matters
- if IS_PYTHON3:
- typecode = force_bytes(valuetype)[0]
- else:
- typecode = ord(valuetype[0])
+ typecode = force_bytes(valuetype)[0]
pytag = force_bytes(pytag)
pytype = type(value)
# use array.tostring() to retrieve byte representation and
# save as bytes
datafmt = "2sBBI%is" % (len(value) * DATATYPE2FORMAT[typecode][1])
- if IS_PYTHON3:
- args.extend([pytag[:2],
- ord("B"),
- typecode,
- len(value),
- value.tobytes()])
- else:
- args.extend([pytag[:2],
- ord("B"),
- typecode,
- len(value),
- force_bytes(value.tostring())])
+ args.extend([pytag[:2],
+ ord("B"),
+ typecode,
+ len(value),
+ value.tobytes()])
else:
if typecode == 0:
if typecode == 0:
raise ValueError("could not deduce typecode for value {}".format(value))
- if typecode == 'a' or typecode == 'A' or typecode == 'Z' or typecode == 'H':
+ if typecode == b'a' or typecode == b'A' or typecode == b'Z' or typecode == b'H':
value = force_bytes(value)
- if typecode == "a":
- typecode = 'A'
+ if typecode == b"a":
+ typecode = b'A'
- if typecode == 'Z' or typecode == 'H':
+ if typecode == b'Z' or typecode == b'H':
datafmt = "2sB%is" % (len(value)+1)
else:
datafmt = "2sB%s" % DATATYPE2FORMAT[typecode][0]
for k from start <= k < end:
# equivalent to seq_nt16_str[bam1_seqi(s, i)] (see bam.c)
# note: do not use string literal as it will be a python string
- s[k-start] = seq_nt16_str[p[k/2] >> 4 * (1 - k%2) & 0xf]
+ s[k-start] = seq_nt16_str[p[k//2] >> 4 * (1 - k%2) & 0xf]
return charptr_to_bytes(seq)
else:
l += nmatches
nmatches = 0
- if md_tag[md_idx] == '^':
+ if md_tag[md_idx] == b'^':
md_idx += 1
while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
md_idx += 1
s_idx += 1
elif op == BAM_CDEL:
for i from 0 <= i < l:
- s[s_idx] = '-'
+ s[s_idx] = b'-'
s_idx += 1
elif op == BAM_CREF_SKIP:
pass
cdef int insertions = 0
while s[s_idx] != 0:
- if s[s_idx] >= 'a':
+ if s[s_idx] >= b'a':
insertions += 1
s_idx += 1
s_idx = 0
else:
# save matches up to this point, skipping insertions
for x from 0 <= x < nmatches:
- while s[s_idx] >= 'a':
+ while s[s_idx] >= b'a':
s_idx += 1
s_idx += 1
- while s[s_idx] >= 'a':
+ while s[s_idx] >= b'a':
s_idx += 1
r_idx += nmatches
nmatches = 0
- if md_tag[md_idx] == '^':
+ if md_tag[md_idx] == b'^':
md_idx += 1
while md_tag[md_idx] >= 65 and md_tag[md_idx] <= 90:
# assert s[s_idx] == '-'
# save matches up to this point, skipping insertions
for x from 0 <= x < nmatches:
- while s[s_idx] >= 'a':
+ while s[s_idx] >= b'a':
s_idx += 1
s_idx += 1
- while s[s_idx] >= 'a':
+ while s[s_idx] >= b'a':
s_idx += 1
seq = PyBytes_FromStringAndSize(s, s_idx)
cdef uint16_t x = 0
for x from l <= x < l + l_extranul:
- p[x] = '\0'
+ p[x] = b'\0'
property flag:
"""properties flag"""
# as the sequence is stored in half-bytes, the total length (sequence
# plus quality scores) is (l+1)/2 + l
- nbytes_new = (l + 1) / 2 + l
- nbytes_old = (src.core.l_qseq + 1) / 2 + src.core.l_qseq
+ nbytes_new = (l + 1) // 2 + l
+ nbytes_old = (src.core.l_qseq + 1) // 2 + src.core.l_qseq
# acquire pointer to location in memory
p = pysam_bam_get_seq(src)
# convert to C string
s = seq
for k from 0 <= k < l:
- p[k/2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
+ p[k // 2] |= seq_nt16_table[<unsigned char>s[k]] << 4 * (1 - k % 2)
# erase qualities
p = pysam_bam_get_qual(src)
return None
s = force_str(self.query_sequence)
if self.is_reverse:
- s = s.translate(maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
+ s = s.translate(str.maketrans("ACGTacgtNnXx", "TGCAtgcaNnXx"))[::-1]
return s
def get_forward_qualities(self):
value, value_type))
# sam_format1 for typecasting
- if typecode == 'Z':
+ if typecode == b'Z':
value = force_bytes(value)
value_ptr = <uint8_t*><char*>value
value_size = len(value)+1
- elif typecode == 'H':
+ elif typecode == b'H':
# Note that hex tags are stored the very same
# way as Z strings.
value = force_bytes(value)
value_ptr = <uint8_t*><char*>value
value_size = len(value)+1
- elif typecode == 'A' or typecode == 'a':
+ elif typecode == b'A' or typecode == b'a':
value = force_bytes(value)
value_ptr = <uint8_t*><char*>value
value_size = sizeof(char)
- typecode = 'A'
- elif typecode == 'i':
+ typecode = b'A'
+ elif typecode == b'i':
int32_t_value = value
value_ptr = <uint8_t*>&int32_t_value
value_size = sizeof(int32_t)
- elif typecode == 'I':
+ elif typecode == b'I':
uint32_t_value = value
value_ptr = <uint8_t*>&uint32_t_value
value_size = sizeof(uint32_t)
- elif typecode == 's':
+ elif typecode == b's':
int16_t_value = value
value_ptr = <uint8_t*>&int16_t_value
value_size = sizeof(int16_t)
- elif typecode == 'S':
+ elif typecode == b'S':
uint16_t_value = value
value_ptr = <uint8_t*>&uint16_t_value
value_size = sizeof(uint16_t)
- elif typecode == 'c':
+ elif typecode == b'c':
int8_t_value = value
value_ptr = <uint8_t*>&int8_t_value
value_size = sizeof(int8_t)
- elif typecode == 'C':
+ elif typecode == b'C':
uint8_t_value = value
value_ptr = <uint8_t*>&uint8_t_value
value_size = sizeof(uint8_t)
- elif typecode == 'd':
+ elif typecode == b'd':
double_value = value
value_ptr = <uint8_t*>&double_value
value_size = sizeof(double)
- elif typecode == 'f':
+ elif typecode == b'f':
float_value = value
value_ptr = <uint8_t*>&float_value
value_size = sizeof(float)
- elif typecode == 'B':
+ elif typecode == b'B':
# the following goes through python, needs to be cleaned up
# pack array using struct
fmt, args = pack_tags([(tag, value, value_type)])
value = bam_aux2f(v)
elif auxtype == 'A' or auxtype == 'a':
# force a to A
- v[0] = 'A'
+ v[0] = b'A'
# there might be a more efficient way
# to convert a char into a string
value = '%c' % <char>bam_aux2A(v)
auxtag[1] = s[1]
s += 2
auxtype = s[0]
- if auxtype in ('c', 'C'):
+ if auxtype in (b'c', b'C'):
value = <int>bam_aux2i(s)
s += 1
- elif auxtype in ('s', 'S'):
+ elif auxtype in (b's', b'S'):
value = <int>bam_aux2i(s)
s += 2
- elif auxtype in ('i', 'I'):
+ elif auxtype in (b'i', b'I'):
value = <int32_t>bam_aux2i(s)
s += 4
- elif auxtype == 'f':
+ elif auxtype == b'f':
value = <float>bam_aux2f(s)
s += 4
- elif auxtype == 'd':
+ elif auxtype == b'd':
value = <double>bam_aux2f(s)
s += 8
- elif auxtype in ('A', 'a'):
+ elif auxtype in (b'A', b'a'):
value = "%c" % <char>bam_aux2A(s)
s += 1
- elif auxtype in ('Z', 'H'):
+ elif auxtype in (b'Z', b'H'):
value = charptr_to_str(<char*>bam_aux2Z(s))
# +1 for NULL terminated string
s += len(value) + 1
- elif auxtype == 'B':
+ elif auxtype == b'B':
s += 1
byte_size, nvalues, value = convert_binary_tag(s)
# 5 for 1 char and 1 int
continue
# see samtools pileup_seq
if mark_ends and p.is_head:
- kputc('^', buf)
+ kputc(b'^', buf)
if p.b.core.qual > 93:
kputc(126, buf)
if p.qpos < p.b.core.l_qseq:
cc = <uint8_t>seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos)]
else:
- cc = 'N'
+ cc = b'N'
if mark_matches and self.reference_sequence != NULL:
rb = self.reference_sequence[self.reference_pos]
if seq_nt16_table[cc] == seq_nt16_table[rb]:
- cc = "="
+ cc = b'='
kputc(strand_mark_char(cc, p.b), buf)
elif add_indels:
if p.is_refskip:
if bam_is_rev(p.b):
- kputc('<', buf)
+ kputc(b'<', buf)
else:
- kputc('>', buf)
+ kputc(b'>', buf)
else:
- kputc('*', buf)
+ kputc(b'*', buf)
if add_indels:
if p.indel > 0:
- kputc('+', buf)
+ kputc(b'+', buf)
kputw(p.indel, buf)
for j from 1 <= j <= p.indel:
cc = seq_nt16_str[bam_seqi(bam_get_seq(p.b), p.qpos + j)]
kputc(strand_mark_char(cc, p.b), buf)
elif p.indel < 0:
- kputc('-', buf)
+ kputc(b'-', buf)
kputw(-p.indel, buf)
for j from 1 <= j <= -p.indel:
# TODO: out-of-range check here?
if self.reference_sequence == NULL:
- cc = 'N'
+ cc = b'N'
else:
cc = self.reference_sequence[self.reference_pos + j]
kputc(strand_mark_char(cc, p.b), buf)
if mark_ends and p.is_tail:
- kputc('$', buf)
+ kputc(b'$', buf)
- kputc(':', buf)
+ kputc(b':', buf)
if buf.l == 0:
# could be zero if all qualities are too low
+# cython: language_level=3
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdlib cimport malloc, calloc, realloc, free
def get_reference_name(self, tid: int) -> Optional[str]: ...
def get_reference_length(self, reference: int) -> int: ...
def is_valid_tid(self, tid: int) -> bool: ...
- def get_tid(self, reference: int) -> int: ...
+ def get_tid(self, reference: str) -> int: ...
class AlignmentFile(HTSFile):
def __init__(
class IteratorRow:
def __iter__(self) -> IteratorRow: ...
- def __next__(self) -> PileupColumn: ...
+ def __next__(self) -> AlignedSegment: ...
class IteratorRowAll(IteratorRow): ...
class IteratorRowAllRefs(IteratorRow): ...
from libc.stdint cimport INT32_MAX
from cpython cimport array as c_array
-from cpython.version cimport PY_MAJOR_VERSION
from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport encode_filename, from_string_and_size
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
from pysam.libchtslib cimport HTSFile, hisremote
-if PY_MAJOR_VERSION >= 3:
- from io import StringIO
-else:
- from StringIO import StringIO
+from io import StringIO
cimport cython
match_or_deletion = {0, 2, 7, 8} # only M/=/X (0/7/8) and D (2) are related to genome position
for r in read_iterator:
base_position = r.pos
+ cigar = r.cigartuples
+ if cigar is None:
+ continue
- for op, nt in r.cigartuples:
+ for op, nt in cigar:
if op in match_or_deletion:
base_position += nt
elif op == BAM_CREF_SKIP:
if self.htsfile == NULL:
return
- cdef int ret = hts_close(self.htsfile)
- self.htsfile = NULL
-
if self.index != NULL:
hts_idx_destroy(self.index)
self.index = NULL
+ cdef int ret = hts_close(self.htsfile)
+ self.htsfile = NULL
+
self.header = None
if ret < 0:
def __dealloc__(self):
cdef int ret = 0
- if self.htsfile != NULL:
- ret = hts_close(self.htsfile)
- self.htsfile = NULL
-
if self.index != NULL:
hts_idx_destroy(self.index)
self.index = NULL
+ if self.htsfile != NULL:
+ ret = hts_close(self.htsfile)
+ self.htsfile = NULL
+
self.header = None
if self.b:
def __dealloc__(self):
bam_destroy1(self.b)
if self.owns_samfile:
- hts_close(self.htsfile)
hts_idx_destroy(self.index)
+ hts_close(self.htsfile)
cdef class IteratorRowRegion(IteratorRow):
+# cython: language_level=3
###############################################################################
###############################################################################
## Cython wrapper for htslib VCF/BCF reader/writer
+# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
from cpython.bytes cimport PyBytes_FromStringAndSize
from cpython.unicode cimport PyUnicode_DecodeUTF8
-from cpython.version cimport PY_MAJOR_VERSION
from pysam.libchtslib cimport HTSFile, hisremote
if pystr:
return <object>pystr
- if PY_MAJOR_VERSION < 3:
- val = s
- else:
- val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
+ val = PyUnicode_DecodeUTF8(s, strlen(s), NULL)
PyDict_SetItemString(bcf_str_cache, s, val)
if contig is not None:
rec.contig = contig
- if alleles is not None:
- rec.alleles = alleles
rec.start = start
rec.stop = stop
rec.id = id
rec.qual = qual
+ if alleles is not None:
+ rec.alleles = alleles
+
if filter is not None:
if isinstance(filter, (list, tuple, VariantRecordFilter)):
for f in filter:
+# cython: language_level=3
cdef extern from "bcftools.pysam.h":
int bcftools_dispatch(int argc, char *argv[])
+# cython: language_level=3
"""Functions that read and write block gzipped files.
The user of the file doesn't have to worry about the compression
line.l = line.m = 0
line.s = NULL
- cdef int ret = bgzf_getline(self.bgzf, '\n', &line)
+ cdef int ret = bgzf_getline(self.bgzf, b'\n', &line)
if ret == -1:
s = b''
elif ret == -2:
+# cython: language_level=3
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdlib cimport malloc, calloc, realloc, free
def __getitem__(self, reference: str) -> str: ...
def __contains__(self, reference: str) -> bool: ...
+
+# Type stub for FastqProxy: only signatures are declared here, the bodies
+# are elided with "...".
+class FastqProxy:
+ # name and sequence are declared as str properties without setters
+ @property
+ def name(self) -> str: ...
+ @property
+ def sequence(self) -> str: ...
+ # comment and quality are Optional, i.e. they may be None
+ @property
+ def comment(self) -> Optional[str]: ...
+ @property
+ def quality(self) -> Optional[str]: ...
+ def to_string(self) -> str: ...
+ # returns an array.array or None; offset has a default value that the
+ # stub does not spell out
+ def get_quality_array(self, offset: int = ...) -> Optional[array.array]: ...
+
+
class FastxRecord:
- comment: str = ...
- quality: str = ...
- sequence: str = ...
- name: str = ...
+ comment: Optional[str] = ...
+ quality: Optional[str] = ...
+ sequence: Optional[str] = ...
+ name: Optional[str] = ...
def __init__(
self,
name: Optional[str] = ...,
PyUnicode_Check, \
PyBytes_FromStringAndSize
-from cpython.version cimport PY_MAJOR_VERSION
-
from pysam.libchtslib cimport \
faidx_nseq, fai_load, fai_load3, fai_destroy, fai_fetch, \
faidx_seq_len, faidx_iseq, faidx_seq_len, \
+# cython: language_level=3
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdlib cimport malloc, calloc, realloc, free
+# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
# adds doc-strings for sphinx
+# cython: language_level=3
from pysam.libcalignmentfile cimport AlignedSegment, AlignmentFile
#################################################
PyUnicode_Check, \
PyBytes_FromStringAndSize
-from cpython.version cimport PY_MAJOR_VERSION
-
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+# cython: language_level=3
cdef extern from "samtools.pysam.h":
int samtools_dispatch(int argc, char *argv[])
+# cython: language_level=3
from libc.stdint cimport int8_t, int16_t, int32_t, int64_t
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t
from libc.stdlib cimport malloc, calloc, realloc, free
+# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
###############################################################################
PyUnicode_Check, PyBytes_FromStringAndSize, \
PyObject_AsFileDescriptor
-from cpython.version cimport PY_MAJOR_VERSION
-
cimport pysam.libctabixproxies as ctabixproxies
from pysam.libchtslib cimport htsFile, hts_open, hts_close, HTS_IDX_START,\
if retval < 0:
break
- if self.buffer.s[0] != '#':
+ if self.buffer.s[0] != b'#':
break
return retval
return charptr_to_str(self.buffer.s, self.encoding)
- def next(self):
- return self.__next__()
-
def __dealloc__(self):
if <void*>self.iterator != NULL:
tbx_itr_destroy(self.iterator)
def __iter__(self):
return self
- def next(self):
- raise StopIteration()
-
def __next__(self):
raise StopIteration()
cdef int retval = 0
while 1:
with nogil:
- retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
+ retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
if retval < 0:
break
cdef int retval = self.__cnext__()
if retval < 0:
raise StopIteration
- if self.buffer.s[0] == '#':
+ if self.buffer.s[0] == b'#':
return self.buffer.s
else:
raise StopIteration
cdef int retval = 0
while 1:
with nogil:
- retval = ks_getuntil(self.kstream, '\n', &self.buffer, &dret)
+ retval = ks_getuntil(self.kstream, b'\n', &self.buffer, &dret)
if retval < 0:
break
b = self.buffer.s
# skip comments
- if (b[0] == '#'):
+ if (b[0] == b'#'):
continue
# skip empty lines
- if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
+ if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
continue
# gzgets terminates at \n, no need to test
def __next__(self):
return self.__cnext__()
- def next(self):
- return self.__cnext__()
-
class tabix_generic_iterator:
'''iterate over ``infile``.
s = force_bytes(line, encoding)
b = s
nbytes = len(line)
- assert b[nbytes] == '\0'
+ assert b[nbytes] == b'\0'
# skip comments
- if b[0] == '#':
+ if b[0] == b'#':
continue
# skip empty lines
- if b[0] == '\0' or b[0] == '\n' or b[0] == '\r':
+ if b[0] == b'\0' or b[0] == b'\n' or b[0] == b'\r':
continue
# make sure that entry is complete
- if b[nbytes-1] != '\n' and b[nbytes-1] != '\r':
+ if b[nbytes-1] != b'\n' and b[nbytes-1] != b'\r':
raise ValueError("incomplete line at %s" % line)
bytes_cpy = <bytes> b
raise StopIteration
- # python version - required for python 2.7
- def next(self):
- return self.__next__()
-
def tabix_iterator(infile, parser):
"""return an iterator over all entries in a file.
:class:`~pysam.asGTF`).
"""
- if PY_MAJOR_VERSION >= 3:
- return tabix_generic_iterator(infile, parser)
- else:
- return tabix_file_iterator(infile, parser)
+ return tabix_generic_iterator(infile, parser)
- # file objects can use C stdio
- # used to be: isinstance( infile, file):
- # if PY_MAJOR_VERSION >= 3:
- # if isinstance( infile, io.IOBase ):
- # return tabix_copy_iterator( infile, parser )
- # else:
- # return tabix_generic_iterator( infile, parser )
- # else:
-# if isinstance( infile, file ):
-# return tabix_copy_iterator( infile, parser )
-# else:
-# return tabix_generic_iterator( infile, parser )
cdef class Tabixfile(TabixFile):
"""Tabixfile is deprecated: use TabixFile instead"""
-#cdef extern from "Python.h":
-# ctypedef struct FILE
+# cython: language_level=3
from libc.stdint cimport uint8_t, int32_t, uint32_t, int64_t, uint64_t
if reset:
for x from 0 <= x < nbytes:
- if self.data[x] == '\0':
- self.data[x] = '\t'
+ if self.data[x] == b'\0':
+ self.data[x] = b'\t'
self.update(self.data, nbytes)
#################################
# remove line breaks and feeds and update number of bytes
x = nbytes - 1
- while x > 0 and (buffer[x] == '\n' or buffer[x] == '\r'):
- buffer[x] = '\0'
+ while x > 0 and (buffer[x] == b'\n' or buffer[x] == b'\r'):
+ buffer[x] = b'\0'
x -= 1
self.nbytes = x + 1
# to guess or dynamically grow
if max_fields == 0:
for x from 0 <= x < nbytes:
- if buffer[x] == '\t':
+ if buffer[x] == b'\t':
max_fields += 1
max_fields += 1
old_pos = pos
while 1:
- pos = <char*>memchr(pos, '\t', nbytes)
+ pos = <char*>memchr(pos, b'\t', nbytes)
if pos == NULL:
break
if field >= max_fields:
"parsing error: more than %i fields in line: %s" %
(max_fields, buffer))
- pos[0] = '\0'
+ pos[0] = b'\0'
pos += 1
self.fields[field] = pos
field += 1
raise ValueError("out of memory")
memcpy(cpy, self.data, self.nbytes+1)
for x from 0 <= x < self.nbytes:
- if cpy[x] == '\0':
- cpy[x] = '\t'
+ if cpy[x] == b'\0':
+ cpy[x] = b'\t'
result = cpy[:self.nbytes]
free(cpy)
r = result.decode(self.encoding)
+# cython: language_level=3
#########################################################################
# Utility functions used across pysam
#########################################################################
+# cython: language_level=3
import types
import sys
import string
for x from 0 <= x < len(qualities):
result[x] = qualities[x] + offset
- if IS_PYTHON3:
- return force_str(result.tobytes())
- else:
- return result.tostring()
+ return force_str(result.tobytes())
cpdef qualities_to_qualitystring(qualities, int offset=33):
## Python 3 compatibility functions
########################################################################
-cdef bint IS_PYTHON3 = PY_MAJOR_VERSION >= 3
-
cdef from_string_and_size(const char* s, size_t length):
- if IS_PYTHON3:
- return s[:length].decode('utf-8', ERROR_HANDLER)
- else:
- return s[:length]
-
+ return s[:length].decode('utf-8', ERROR_HANDLER)
# filename encoding (adapted from lxml.etree.pyx)
cdef str FILENAME_ENCODING = sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii'
"""Make sure a filename is 8-bit encoded (or None)."""
if filename is None:
return None
- elif PY_MAJOR_VERSION >= 3 and PY_MINOR_VERSION >= 2:
- # Added to support path-like objects
- return os.fsencode(filename)
- elif PyBytes_Check(filename):
- return filename
- elif PyUnicode_Check(filename):
- return filename.encode(FILENAME_ENCODING)
- else:
- raise TypeError("Argument must be string or unicode.")
+ return os.fsencode(filename)
cdef bytes force_bytes(object s, encoding=None, errors=None):
cdef charptr_to_str(const char* s, encoding=None, errors=None):
if s == NULL:
return None
- if PY_MAJOR_VERSION < 3:
- return s
- else:
- return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
+ return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
cdef charptr_to_str_w_len(const char* s, size_t n, encoding=None, errors=None):
if s == NULL:
return None
- if PY_MAJOR_VERSION < 3:
- return s[:n]
- else:
- return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
+ return s[:n].decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
cdef bytes charptr_to_bytes(const char* s, encoding=None, errors=None):
(bytes in Py2, unicode in Py3)"""
if s is None:
return None
- if PY_MAJOR_VERSION < 3:
- return s
- elif PyBytes_Check(s):
+ if PyBytes_Check(s):
return s.decode(encoding or TEXT_ENCODING, errors or ERROR_HANDLER)
- else:
- # assume unicode
- return s
+ # assume unicode
+ return s
cdef decode_bytes(bytes s, encoding=None, errors=None):
'''
if method == "index" and args:
- # We make sure that at least 1 input file exists,
+ # We make sure that at least the first specified input file exists,
# and if it doesn't we raise an IOError.
- SIMPLE_FLAGS = ['-c', '--csi', '-f', '--force', '-t', '--tbi', '-n', '--nstats', '-s', '--stats']
- ARGUMENTS = ['-m', '--min-shift', '-o', '--output-file', '--threads', '-@']
+ ARGUMENTS = ['-m', '--min-shift', '-o', '--output', '--output-file', '-@', '--threads']
skip_next = False
for arg in args:
if skip_next:
skip_next = False
continue
- if arg in SIMPLE_FLAGS or (len(arg) > 2 and force_bytes(arg).startswith(b'-@')):
- continue
- if arg in ARGUMENTS:
- skip_next = True
+ if arg.startswith('-'):
+ # Skip next argument for e.g. '--min-shift' '12' or '-m' '12' but not '-m12'
+ if arg in ARGUMENTS:
+ skip_next = True
continue
if not os.path.exists(arg):
raise IOError("No such file or directory: '%s'" % arg)
-
+# cython: language_level=3
+import platform
+from typing import (
+ Callable,
+ List,
+ Tuple,
+ Iterable,
+ Union,
+)
+try:
+ from typing import Final
+ HAVE_FINAL = True
+except ImportError:
+ HAVE_FINAL = False
+
from pysam.utils import PysamDispatcher
+
# samtools command line options to export in python
-SAMTOOLS_DISPATCH = {
+_SAMTOOLS_DISPATCH = {
# samtools 'documented' commands
- "view": ("view", None),
- "head": ("head", None),
- "sort": ("sort", None),
- "mpileup": ("mpileup", None),
- "consensus": ("consensus", None),
- "depth": ("depth", None),
- "faidx": ("faidx", None),
- "fqidx": ("fqidx", None),
- "tview": ("tview", None),
- "index": ("index", None),
- "idxstats": ("idxstats", None),
- "fixmate": ("fixmate", None),
- "flagstat": ("flagstat", None),
- "calmd": ("calmd", None),
- "merge": ("merge", None),
- "markdup": ("markdup", None),
- "rmdup": ("rmdup", None),
- "reference": ("reference", None),
- "reheader": ("reheader", None),
- "cat": ("cat", None),
- "targetcut": ("targetcut", None),
- "phase": ("phase", None),
- "bam2fq": ("bam2fq", None),
- "dict": ("dict", None),
- "addreplacerg": ("addreplacerg", None),
- "pad2unpad": ("pad2unpad", None),
- "depad": ("pad2unpad", None),
- "bedcov": ("bedcov", None),
- "coverage": ("coverage", None),
- "bamshuf": ("bamshuf", None),
- "collate": ("collate", None),
- "stats": ("stats", None),
- "fasta": ("fasta", None),
- "fastq": ("fastq", None),
- "quickcheck": ("quickcheck", None),
- "split": ("split", None),
- "flags": ("flags", None),
- "ampliconclip": ("ampliconclip", None),
- "ampliconstats": ("ampliconstats", None),
- "version": ("version", None),
- "fqimport": ("import", None),
- "samples": ("samples", None),
+ "view": ("view", ()),
+ "head": ("head", ()),
+ "sort": ("sort", ()),
+ "mpileup": ("mpileup", ()),
+ "consensus": ("consensus", ()),
+ "depth": ("depth", ()),
+ "faidx": ("faidx", ()),
+ "fqidx": ("fqidx", ()),
+ "tview": ("tview", ()),
+ "index": ("index", ()),
+ "idxstats": ("idxstats", ()),
+ "fixmate": ("fixmate", ()),
+ "flagstat": ("flagstat", ()),
+ "calmd": ("calmd", ()),
+ "merge": ("merge", ()),
+ "markdup": ("markdup", ()),
+ "rmdup": ("rmdup", ()),
+ "reference": ("reference", ()),
+ "reheader": ("reheader", ()),
+ "reset": ("reset", ()),
+ "cat": ("cat", ()),
+ "targetcut": ("targetcut", ()),
+ "phase": ("phase", ()),
+ "bam2fq": ("bam2fq", ()),
+ "dict": ("dict", ()),
+ "addreplacerg": ("addreplacerg", ()),
+ "pad2unpad": ("pad2unpad", ()),
+ "depad": ("pad2unpad", ()),
+ "bedcov": ("bedcov", ()),
+ "coverage": ("coverage", ()),
+ "bamshuf": ("bamshuf", ()),
+ "collate": ("collate", ()),
+ "stats": ("stats", ()),
+ "fasta": ("fasta", ()),
+ "fastq": ("fastq", ()),
+ "cram_size": ("cram-size", ()),
+ "quickcheck": ("quickcheck", ()),
+ "split": ("split", ()),
+ "flags": ("flags", ()),
+ "ampliconclip": ("ampliconclip", ()),
+ "ampliconstats": ("ampliconstats", ()),
+ "version": ("version", ()),
+ "fqimport": ("import", ()),
+ "import_": ("import", ()),
+ "samples": ("samples", ()),
}
-# instantiate samtools commands as python functions
-for key, options in SAMTOOLS_DISPATCH.items():
- cmd, parser = options
- globals()[key] = PysamDispatcher("samtools", cmd, parser)
-__all__ = list(SAMTOOLS_DISPATCH)
+def _wrap_command(
+    dispatch: str,
+    parsers: Iterable[Tuple[str, Callable[[Union[str, List[str]]], Union[str, List[str]]]]],
+) -> PysamDispatcher:
+    """Return the dispatcher for the samtools sub-command ``dispatch``.
+
+    ``parsers`` is handed to :class:`PysamDispatcher` unchanged.
+    """
+    return PysamDispatcher(collection="samtools", dispatch=dispatch, parsers=parsers)
+
+
+# Expose every entry of _SAMTOOLS_DISPATCH as a module-level callable.
+if not HAVE_FINAL:
+    # python <3.8: typing.Final could not be imported, so the commands are
+    # registered dynamically from the dispatch table.
+    for key, options in _SAMTOOLS_DISPATCH.items():
+        cmd, parser = options
+        globals()[key] = _wrap_command(cmd, parser)
+else:
+    # python >=3.8: explicit, annotated assignments so that static type
+    # checkers can see each command as a module attribute.
+    view: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["view"][0], _SAMTOOLS_DISPATCH["view"][1])
+
+    head: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["head"][0], _SAMTOOLS_DISPATCH["head"][1])
+
+    sort: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["sort"][0], _SAMTOOLS_DISPATCH["sort"][1])
+
+    mpileup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["mpileup"][0], _SAMTOOLS_DISPATCH["mpileup"][1])
+
+    consensus: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["consensus"][0],
+        _SAMTOOLS_DISPATCH["consensus"][1],
+    )
+
+    depth: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["depth"][0], _SAMTOOLS_DISPATCH["depth"][1])
+
+    faidx: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["faidx"][0], _SAMTOOLS_DISPATCH["faidx"][1])
+
+    fqidx: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fqidx"][0], _SAMTOOLS_DISPATCH["fqidx"][1])
+
+    tview: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["tview"][0], _SAMTOOLS_DISPATCH["tview"][1])
+
+    index: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["index"][0], _SAMTOOLS_DISPATCH["index"][1])
+
+    idxstats: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["idxstats"][0], _SAMTOOLS_DISPATCH["idxstats"][1])
+
+    fixmate: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fixmate"][0], _SAMTOOLS_DISPATCH["fixmate"][1])
+
+    flagstat: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["flagstat"][0], _SAMTOOLS_DISPATCH["flagstat"][1])
+
+    calmd: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["calmd"][0], _SAMTOOLS_DISPATCH["calmd"][1])
+
+    merge: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["merge"][0], _SAMTOOLS_DISPATCH["merge"][1])
+
+    markdup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["markdup"][0], _SAMTOOLS_DISPATCH["markdup"][1])
+
+    rmdup: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["rmdup"][0], _SAMTOOLS_DISPATCH["rmdup"][1])
+
+    reference: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["reference"][0],
+        _SAMTOOLS_DISPATCH["reference"][1],
+    )
+
+    reheader: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["reheader"][0], _SAMTOOLS_DISPATCH["reheader"][1])
+
+    reset: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["reset"][0], _SAMTOOLS_DISPATCH["reset"][1])
+
+    cat: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["cat"][0], _SAMTOOLS_DISPATCH["cat"][1])
+
+    targetcut: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["targetcut"][0],
+        _SAMTOOLS_DISPATCH["targetcut"][1],
+    )
+
+    phase: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["phase"][0], _SAMTOOLS_DISPATCH["phase"][1])
+
+    bam2fq: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bam2fq"][0], _SAMTOOLS_DISPATCH["bam2fq"][1])
+
+    dict: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["dict"][0], _SAMTOOLS_DISPATCH["dict"][1])
+
+    addreplacerg: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["addreplacerg"][0],
+        _SAMTOOLS_DISPATCH["addreplacerg"][1],
+    )
+
+    pad2unpad: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["pad2unpad"][0],
+        _SAMTOOLS_DISPATCH["pad2unpad"][1],
+    )
+
+    depad: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["depad"][0], _SAMTOOLS_DISPATCH["depad"][1])
+
+    bedcov: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bedcov"][0], _SAMTOOLS_DISPATCH["bedcov"][1])
+
+    coverage: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["coverage"][0], _SAMTOOLS_DISPATCH["coverage"][1])
+
+    bamshuf: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["bamshuf"][0], _SAMTOOLS_DISPATCH["bamshuf"][1])
+
+    collate: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["collate"][0], _SAMTOOLS_DISPATCH["collate"][1])
+
+    stats: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["stats"][0], _SAMTOOLS_DISPATCH["stats"][1])
+
+    fasta: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fasta"][0], _SAMTOOLS_DISPATCH["fasta"][1])
+
+    fastq: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fastq"][0], _SAMTOOLS_DISPATCH["fastq"][1])
+
+    cram_size: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["cram_size"][0], _SAMTOOLS_DISPATCH["cram_size"][1])
+
+    quickcheck: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["quickcheck"][0],
+        _SAMTOOLS_DISPATCH["quickcheck"][1],
+    )
+
+    split: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["split"][0], _SAMTOOLS_DISPATCH["split"][1])
+
+    flags: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["flags"][0], _SAMTOOLS_DISPATCH["flags"][1])
+
+    ampliconclip: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["ampliconclip"][0],
+        _SAMTOOLS_DISPATCH["ampliconclip"][1],
+    )
+
+    ampliconstats: Final[PysamDispatcher] = _wrap_command(
+        _SAMTOOLS_DISPATCH["ampliconstats"][0],
+        _SAMTOOLS_DISPATCH["ampliconstats"][1],
+    )
+
+    version: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["version"][0], _SAMTOOLS_DISPATCH["version"][1])
+
+    fqimport: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["fqimport"][0], _SAMTOOLS_DISPATCH["fqimport"][1])
+
+    import_: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["import_"][0], _SAMTOOLS_DISPATCH["import_"][1])
+
+    samples: Final[PysamDispatcher] = _wrap_command(_SAMTOOLS_DISPATCH["samples"][0], _SAMTOOLS_DISPATCH["samples"][1])
+
+# The exported names are the dispatch-table keys on every Python version.
+# Defining __all__ after both branches keeps star-imports of this module from
+# also exporting helper names such as the typing imports or HAVE_FINAL.
+__all__ = list(_SAMTOOLS_DISPATCH)
+from typing import (
+ Callable,
+ List,
+ Tuple,
+ Iterable,
+ Union,
+)
+
from pysam.libcutils import _pysam_dispatch
parsers = None
collection = None
- def __init__(self, collection, dispatch, parsers):
+ def __init__(
+ self,
+ collection: str,
+ dispatch: str,
+ parsers: Iterable[Tuple[str, Callable[[Union[str, List[str]]], Union[str, List[str]]]]],
+ ):
self.collection = collection
self.dispatch = dispatch
self.parsers = parsers
self.stderr = []
- def __call__(self, *args, **kwargs):
- '''execute a samtools command.
+ def __call__(self, *args: str, **kwargs) -> Union[str, List[str]]:
+ '''
+ execute a samtools command.
Keyword arguments:
catch_stdout -- redirect stdout from the samtools command and
// Version information used while compiling samtools, bcftools, and htslib
-#define SAMTOOLS_VERSION "1.16.1 (pysam)"
-#define BCFTOOLS_VERSION "1.16 (pysam)"
-#define HTS_VERSION_TEXT "1.16 (pysam)"
+#define SAMTOOLS_VERSION "1.17 (pysam)"
+#define BCFTOOLS_VERSION "1.17 (pysam)"
+#define HTS_VERSION_TEXT "1.17 (pysam)"
# pysam versioning information
-__version__ = "0.20.0"
+__version__ = "0.21.0"
-__samtools_version__ = "1.16.1"
-__bcftools_version__ = "1.16"
-__htslib_version__ = "1.16"
+__samtools_version__ = "1.17"
+__bcftools_version__ = "1.17"
+__htslib_version__ = "1.17"
The MIT/Expat License
-Copyright (C) 2008-2022 Genome Research Ltd.
+Copyright (C) 2008-2023 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
The typical simple case of building Samtools using the HTSlib bundled within
this Samtools release tarball is done as follows:
- cd .../samtools-1.16.1 # Within the unpacked release directory
+ cd .../samtools-1.17 # Within the unpacked release directory
./configure
make
installation using the HTSlib bundled within this Samtools release tarball,
and building the various HTSlib utilities such as bgzip is done as follows:
- cd .../samtools-1.16.1 # Within the unpacked release directory
+ cd .../samtools-1.17 # Within the unpacked release directory
./configure --prefix=/path/to/location
make all all-htslib
make install install-htslib
To build with plug-ins, you need to use the --enable-plugins configure option
as follows:
- cd .../samtools-1.16.1 # Within the unpacked release directory
+ cd .../samtools-1.17 # Within the unpacked release directory
./configure --enable-plugins --prefix=/path/to/location
make all all-htslib
make install install-htslib
the source distribution instead of installing the package. In that case
you can use:
- cd .../samtools-1.16.1 # Within the unpacked release directory
- ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.16
+ cd .../samtools-1.17 # Within the unpacked release directory
+ ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.17
make all all-htslib
It is possible to override the built-in search path using the HTS_PATH
}
int r;
for (r = 0; r < nref; r++) {
+ if (!amps[r].sites)
+ continue;
if (!amps[r].ref ||
strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
amps[r].len != sam_hdr_tid2len(header, r)) {
}
int r;
for (r = 0; r < nref; r++) {
+ if (!amps[r].sites)
+ continue;
if (!amps[r].ref ||
strcmp(amps[r].ref, sam_hdr_tid2name(header, r)) != 0 ||
amps[r].len != sam_hdr_tid2len(header, r)) {
typedef struct {
int header;
int flag;
+ int incl_flag;
+ int require_flag;
int min_qual;
int min_mqual;
int min_len;
if (b[i]->core.tid < 0)
continue;
if (b[i]->core.flag & opt->flag)
- continue;
+ continue; // must have none of the flags set
+ if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0)
+ continue; // must have at least one flag set
+ if ((b[i]->core.flag & opt->require_flag) != opt->require_flag)
+ continue; // must have all flags set
if (b[i]->core.qual < opt->min_mqual)
continue;
if (b[i]->core.tid < 0)
continue;
if (b[i]->core.flag & opt->flag)
- continue;
+ continue; // must have none of the flags set
+ if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0)
+ continue; // must have at least one flag set
+ if ((b[i]->core.flag & opt->require_flag) != opt->require_flag)
+ continue; // must have all flags set
if (b[i]->core.qual < opt->min_mqual)
continue;
fprintf(fp, " -b FILE Use bed FILE for list of regions\n");
fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n");
fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n");
- fprintf(fp, " -g INT Remove specified flags from default flag filter\n");
- fprintf(fp, " -G INT Add specified flags to the default flag filter\n");
+ fprintf(fp, " -g INT Remove specified flags from default filter-out flag list\n");
+ fprintf(fp, " -G, --excl-flags FLAGS\n");
+ fprintf(fp, " Add specified flags to the default filter-out flag list\n");
+ fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n");
+ fprintf(fp, " --incl-flags FLAGS\n");
+ fprintf(fp, " Only include records with at least one the FLAGs present [0]\n");
+ fprintf(fp, " --require-flags FLAGS\n");
+ fprintf(fp, " Only include records with all of the FLAGs present [0]\n");
fprintf(fp, " -H Print a file header line\n");
fprintf(fp, " -l INT Minimum read length [0]\n");
fprintf(fp, " -o FILE Write output to FILE [stdout]\n");
" Filter bases with base quality smaller than INT [0]\n");
fprintf(fp, " -Q, --min-MQ INT\n"
" Filter alignments with mapping quality smaller than INT [0]\n");
- fprintf(fp, " -H Print a file header\n");
fprintf(fp, " -J Include reads with deletions in depth computation\n");
fprintf(fp, " -s Do not count overlapping reads within a template\n");
sam_global_opt_help(fp, "-.--.@-.");
char *out_file = NULL;
depth_opt opt = {
.flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
+ .incl_flag = 0,
+ .require_flag = 0,
.min_qual = 0,
.min_mqual = 0,
.skip_del = 1,
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- {"min-MQ", required_argument, NULL, 'Q'},
- {"min-mq", required_argument, NULL, 'Q'},
- {"min-BQ", required_argument, NULL, 'q'},
- {"min-bq", required_argument, NULL, 'q'},
+ {"min-MQ", required_argument, NULL, 'Q'},
+ {"min-mq", required_argument, NULL, 'Q'},
+ {"min-BQ", required_argument, NULL, 'q'},
+ {"min-bq", required_argument, NULL, 'q'},
+ {"excl-flags", required_argument, NULL, 'G'},
+ {"incl-flags", required_argument, NULL, 1},
+ {"require-flags", required_argument, NULL, 2},
SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{NULL, 0, NULL, 0}
};
case 'g':
opt.flag &= ~bam_str2flag(optarg);
break;
- case 'G':
+ case 'G': // reject if any set
opt.flag |= bam_str2flag(optarg);
break;
+ case 1: // reject unless at least one set (0 means ignore option)
+ opt.incl_flag |= bam_str2flag(optarg);
+ break;
+ case 2: // reject unless all set
+ opt.require_flag |= bam_str2flag(optarg);
+ break;
case 'l':
opt.min_len = atoi(optarg);
typedef struct {
int header;
int flag;
+ int incl_flag;
+ int require_flag;
int min_qual;
int min_mqual;
int min_len;
if (b[i]->core.tid < 0)
continue;
if (b[i]->core.flag & opt->flag)
- continue;
+ continue; // must have none of the flags set
+ if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0)
+ continue; // must have at least one flag set
+ if ((b[i]->core.flag & opt->require_flag) != opt->require_flag)
+ continue; // must have all flags set
if (b[i]->core.qual < opt->min_mqual)
continue;
if (b[i]->core.tid < 0)
continue;
if (b[i]->core.flag & opt->flag)
- continue;
+ continue; // must have none of the flags set
+ if (opt->incl_flag && (b[i]->core.flag & opt->incl_flag) == 0)
+ continue; // must have at least one flag set
+ if ((b[i]->core.flag & opt->require_flag) != opt->require_flag)
+ continue; // must have all flags set
if (b[i]->core.qual < opt->min_mqual)
continue;
fprintf(fp, " -b FILE Use bed FILE for list of regions\n");
fprintf(fp, " -f FILE Specify list of input BAM/SAM/CRAM filenames\n");
fprintf(fp, " -X Use custom index files (in -X *.bam *.bam.bai order)\n");
- fprintf(fp, " -g INT Remove specified flags from default flag filter\n");
- fprintf(fp, " -G INT Add specified flags to the default flag filter\n");
+ fprintf(fp, " -g INT Remove specified flags from default filter-out flag list\n");
+ fprintf(fp, " -G, --excl-flags FLAGS\n");
+ fprintf(fp, " Add specified flags to the default filter-out flag list\n");
+ fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n");
+ fprintf(fp, " --incl-flags FLAGS\n");
+ fprintf(fp, " Only include records with at least one the FLAGs present [0]\n");
+ fprintf(fp, " --require-flags FLAGS\n");
+ fprintf(fp, " Only include records with all of the FLAGs present [0]\n");
fprintf(fp, " -H Print a file header line\n");
fprintf(fp, " -l INT Minimum read length [0]\n");
fprintf(fp, " -o FILE Write output to FILE [samtools_stdout]\n");
" Filter bases with base quality smaller than INT [0]\n");
fprintf(fp, " -Q, --min-MQ INT\n"
" Filter alignments with mapping quality smaller than INT [0]\n");
- fprintf(fp, " -H Print a file header\n");
fprintf(fp, " -J Include reads with deletions in depth computation\n");
fprintf(fp, " -s Do not count overlapping reads within a template\n");
sam_global_opt_help(fp, "-.--.@-.");
char *out_file = NULL;
depth_opt opt = {
.flag = BAM_FUNMAP | BAM_FSECONDARY | BAM_FDUP | BAM_FQCFAIL,
+ .incl_flag = 0,
+ .require_flag = 0,
.min_qual = 0,
.min_mqual = 0,
.skip_del = 1,
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- {"min-MQ", required_argument, NULL, 'Q'},
- {"min-mq", required_argument, NULL, 'Q'},
- {"min-BQ", required_argument, NULL, 'q'},
- {"min-bq", required_argument, NULL, 'q'},
+ {"min-MQ", required_argument, NULL, 'Q'},
+ {"min-mq", required_argument, NULL, 'Q'},
+ {"min-BQ", required_argument, NULL, 'q'},
+ {"min-bq", required_argument, NULL, 'q'},
+ {"excl-flags", required_argument, NULL, 'G'},
+ {"incl-flags", required_argument, NULL, 1},
+ {"require-flags", required_argument, NULL, 2},
SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{NULL, 0, NULL, 0}
};
case 'g':
opt.flag &= ~bam_str2flag(optarg);
break;
- case 'G':
+ case 'G': // reject if any set
opt.flag |= bam_str2flag(optarg);
break;
+ case 1: // reject unless at least one set (0 means ignore option)
+ opt.incl_flag |= bam_str2flag(optarg);
+ break;
+ case 2: // reject unless all set
+ opt.require_flag |= bam_str2flag(optarg);
+ break;
case 'l':
opt.min_len = atoi(optarg);
/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
from the 5' end.
- Copyright (C) 2020-2021 Genome Research Ltd.
+ Copyright (C) 2020-2022 Genome Research Ltd.
Authors: Andrew Whitwham <aw7@sanger.ac.uk>
Rob Davies <rmd+git@sanger.ac.uk>
memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
memcpy(rec_out->data, rec->data, rec->core.l_qname);
- if (clipping == hard_clip && bases >= rec->core.l_qseq) {
- rec_out->core.l_qseq = 0;
- rec_out->core.n_cigar = 0;
-
- if (orig_l_aux)
- memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
-
- rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
-
- return 0;
- }
-
// Modify CIGAR
new_cigar = bam_get_cigar(rec_out);
qry_removed += ref_remove;
}
} else {
+ if (clipping == hard_clip) {
+
+ rec_out->core.l_qseq = 0;
+ rec_out->core.n_cigar = 0;
+
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+ return 0;
+ }
+
qry_removed = rec->core.l_qseq;
}
memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
memcpy(rec_out->data, rec->data, rec->core.l_qname);
- if (clipping == hard_clip && bases >= rec->core.l_qseq) {
- rec_out->core.l_qseq = 0;
- rec_out->core.n_cigar = 0;
-
- if (orig_l_aux)
- memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
-
- rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
- return 0;
- }
-
// Modify CIGAR here
new_cigar = bam_get_cigar(rec_out);
if (qry_removed > 0) j++;
if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
} else {
+ if (clipping == hard_clip) {
+
+ rec_out->core.l_qseq = 0;
+ rec_out->core.n_cigar = 0;
+
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+ return 0;
+ }
+
qry_removed = rec->core.l_qseq;
j = 0;
if (hardclip > 0 && clipping == soft_clip) j++;
/* bam_ampliconclip.c -- loads amplicon primers from a BED file and cuts reads
from the 5' end.
- Copyright (C) 2020-2021 Genome Research Ltd.
+ Copyright (C) 2020-2022 Genome Research Ltd.
Authors: Andrew Whitwham <aw7@sanger.ac.uk>
Rob Davies <rmd+git@sanger.ac.uk>
memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
memcpy(rec_out->data, rec->data, rec->core.l_qname);
- if (clipping == hard_clip && bases >= rec->core.l_qseq) {
- rec_out->core.l_qseq = 0;
- rec_out->core.n_cigar = 0;
-
- if (orig_l_aux)
- memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
-
- rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
-
- return 0;
- }
-
// Modify CIGAR
new_cigar = bam_get_cigar(rec_out);
qry_removed += ref_remove;
}
} else {
+ if (clipping == hard_clip) {
+
+ rec_out->core.l_qseq = 0;
+ rec_out->core.n_cigar = 0;
+
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+ return 0;
+ }
+
qry_removed = rec->core.l_qseq;
}
memcpy(&rec_out->core, &rec->core, sizeof(rec->core));
memcpy(rec_out->data, rec->data, rec->core.l_qname);
- if (clipping == hard_clip && bases >= rec->core.l_qseq) {
- rec_out->core.l_qseq = 0;
- rec_out->core.n_cigar = 0;
-
- if (orig_l_aux)
- memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
-
- rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
- return 0;
- }
-
// Modify CIGAR here
new_cigar = bam_get_cigar(rec_out);
if (qry_removed > 0) j++;
if (hardclip > 0 && (clipping == soft_clip || qry_removed == 0)) j++;
} else {
+ if (clipping == hard_clip) {
+
+ rec_out->core.l_qseq = 0;
+ rec_out->core.n_cigar = 0;
+
+ if (orig_l_aux)
+ memcpy(bam_get_aux(rec_out), orig_aux, orig_l_aux);
+
+ rec_out->l_data = bam_get_aux(rec_out) - rec_out->data + orig_l_aux;
+
+ return 0;
+ }
+
qry_removed = rec->core.l_qseq;
j = 0;
if (hardclip > 0 && clipping == soft_clip) j++;
/* bam_consensus.c -- consensus subcommand.
Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source)
- Copyright (C) 2003-2005,2007-2022 Genome Research Ltd.
+ Copyright (C) 2003-2005,2007-2023 Genome Research Ltd.
Author: James Bonfield <jkb@sanger.ac.uk>
// but 30T+ 20T- 18A+ 2A- seems like a consistent A miscall on one strand
// only, while T is spread evenly across both strands.
+// TODO: Phasing of long reads.
+// Long reads offer very strong phasing opportunities for SNPs.
+// From these, we get strong evidence for accuracy of indels.
+// Specifically whether the distribution of poly-len within a phase
+// is significantly different to the distribution of poly len between
+// phases.
+
+// TODO end STR trimming. Eg:
+// REF AAGCTGAAAAGTTAATGTCTTATTTTTTTTTTTTTTTTGAGATGGAGTC
+// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc
+// aagctgaaaagttaatgtcttattttttttt
+// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc
+// Middle seq doesn't validate those initial T alignments.
+// Qual_train solves this by use of the STR trimmer.
+
+// TODO add a weight for proximity to homopolymer.
+// Maybe length/distance? So 3 away from a 12-mer is similar to 1 away
+// from a 4-mer?
+
+// TODO: Count number of base types between this point and the nearest
+// indel or end of read. Eg GATCG<here>AGAGAG*TAGC => 2 (A and G).
+// adj is nbase/4 * score, or (nbase+1)/5?
+// Perhaps multiplied by length too, to get local complexity score?
+
#include <config.h>
#include <stdio.h>
#include <ctype.h>
#include <htslib/sam.h>
+#include <htslib/hfile.h>
#include "samtools.h"
#include "sam_opts.h"
# define MAX(a,b) ((a)>(b)?(a):(b))
#endif
+// Defines for experimental code which is currently disabled
+
+// Hardy-Weinberg statistics to check heterozygous sites match allelic
+// frequencies.
+//#define DO_HDW
+
+// Filter bayesian calls by min-depth and min-fract parameters
+//#define DO_FRACT
+
+// Checks uniqueness of surrounding bases to adjust scores
+//#define K2 2
+
+// Look for strand bias in distribution of homopolymer lengths
+//#define DO_POLY_DIST
+
// Minimum cutoff for storing mod data; => at least 10% chance
#define MOD_CUTOFF 0.46
typedef unsigned char uc;
+// Simple recalibration table for substitutions, undercalls and overcalls.
+// In future, we'll update this to be kmer based too.
+// Each map is indexed by the reported quality value (0..100) and holds the
+// recalibrated quality; consensus_init() treats the stored values as phred
+// scores when deriving its probability tables.
+typedef struct {
+ int smap[101]; // substitution or SNP
+ int umap[101]; // undercall or DEL
+ int omap[101]; // overcall or INS
+} qcal_t;
+
typedef struct {
// User options
char *reg;
int min_depth;
double call_fract;
double het_fract;
- int gap5;
+ int mode; // One of MODE_* macros below
enum format fmt;
int cons_cutoff;
int ambig;
int all_bases;
int show_del;
int show_ins;
+ int mark_ins;
int excl_flags;
int incl_flags;
int min_mqual;
double P_het;
+ double P_indel;
+ double het_scale;
+ double homopoly_fix;
+ double homopoly_redux;
+ qcal_t qcal;
// Internal state
samFile *fp;
float discrep;
} consensus_t;
-#define P_HET 1e-4
+#define P_HET 1e-3
+#define P_INDEL 2e-4
+#define P_HOMOPOLY 0.5
+#define P_HET_SCALE 1.0
#define LOG10 2.30258509299404568401
#define TENOVERLOG10 4.34294481903251827652
#define ALIGNED(x)
#endif
-static double prior[25] ALIGNED(16); /* Sum to 1.0 */
-static double lprior15[15] ALIGNED(16); /* 15 combinations of {ACGT*} */
-
-/* Precomputed matrices for the consensus algorithm */
-static double pMM[101] ALIGNED(16);
-static double p__[101] ALIGNED(16);
-static double p_M[101] ALIGNED(16);
-
+// Initialised once as a global array. This won't work if threaded,
+// but we'll rewrite if and when that gets added later.
static double e_tab_a[1002] ALIGNED(16);
static double *e_tab = &e_tab_a[500];
static double e_tab2_a[1002] ALIGNED(16);
static double *e_tab2 = &e_tab2_a[500];
static double e_log[501] ALIGNED(16);
+/* Precomputed matrices for the consensus algorithm */
+// One complete parameter set for the Bayesian consensus, filled in by
+// consensus_init().  The [101] tables are indexed by quality value and hold
+// natural-log likelihoods.
+typedef struct {
+ // 5x5 matrix over ACGT* by ACGT* (AA=0, CC=6, GG=12, TT=18, **=24).
+ // consensus_init() sets the diagonal to 1 and the off-diagonal entries
+ // relative to that, so the values are not normalised to sum to 1.0.
+ double prior[25] ALIGNED(16);
+ double lprior15[15] ALIGNED(16); /* log prior of the 15 combinations of {ACGT*} */
+
+ // Substitution terms, derived from qcal smap.
+ double pMM[101] ALIGNED(16); // log(prob): observation matches
+ double p__[101] ALIGNED(16); // log((1-prob)/3): observation mismatches
+ double p_M[101] ALIGNED(16); // mean of MM and __ likelihoods, plus log(het_scale)
+ // Overcall terms, derived from qcal omap (copies of the substitution
+ // terms in MODE_BAYES_116).
+ double po_[101] ALIGNED(16); // mean of oo and __ likelihoods
+ double poM[101] ALIGNED(16); // mean of oo and MM likelihoods, capped at p_M+.5
+ double poo[101] ALIGNED(16); // log((1-prob)/3), capped at pMM-.5
+ // Undercall terms, derived from qcal umap (copies of the substitution
+ // terms in MODE_BAYES_116).
+ double puu[101] ALIGNED(16); // log((1-prob)/3), capped at pMM-.5
+ double pum[101] ALIGNED(16); // mean of uu and mm likelihoods
+ double pmm[101] ALIGNED(16); // log(prob)
+
+ // Multiplier on homopolymer length before reducing phred qual
+ double poly_mul;
+} cons_probs;
+
+// Two sets of params; recall oriented (gap5) and precision (stf).
+// We use the former unless MODE_MIXED is set (which is the default
+// for bayesian consensus mode if P_indel is significant).
+static cons_probs cons_prob_recall, cons_prob_precise;
+
/*
* Lots of confusing matrix terms here, so some definitions will help.
*
* The heterozygosity weight though is a per column calculation as we're
* trying to model whether the column is pure or mixed. Hence this is done
* once via a prior and has no affect on the individual matrix cells.
+ *
+ * We have a generic indel probability, but it's a catch all for overcall,
+ * undercall, alignment artifacts, homopolymer issues, etc. So we can set
+ * it considerably higher and just let the QUAL skew do the filtering for
+ * us, albeit no longer well calibrated.
*/
-static void consensus_init(double p_het) {
+// NB: Should _M be MM?
+// Ie sample really is A/C het, and we observe C. That should be a match,
+// not half a match.
+
+#define MODE_SIMPLE 0 // freq counting
+
+#define MODE_BAYES_116 1 // Samtools 1.16 (no indel param)
+#define MODE_RECALL 2 // so called as it's the params from Gap5
+#define MODE_PRECISE 3 // a more precise set; +FN, --FP
+#define MODE_MIXED 4 // Combination of GAP5/BAYES
+
+#define QCAL_FLAT 0
+#define QCAL_HIFI 1
+#define QCAL_HISEQ 2
+#define QCAL_ONT_R10_4_SUP 3
+#define QCAL_ONT_R10_4_DUP 4
+#define QCAL_ULTIMA 5
+
+// Calibration tables here don't necessarily reflect the true accuracy.
+// They have been manually tuned to work in conjunction with other command
+// line parameters used in the machine profiles. For example reducing one
+// qual here and increasing sensitivity elsewhere via another parameter.
+static qcal_t static_qcal[6] = {
+ { // FLAT
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}
+ },
+
+ { // HiFi
+ {10, 11, 11, 12, 13, 14, 15, 16, 18, 19,
+ 20, 21, 22, 23, 24, 25, 27, 28, 29, 30,
+ 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+ 38, 39, 39, 40, 40, 41, 41, 41, 41, 42,
+ 42, 42, 42, 43, 43, 43, 43, 43, 43, 43,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ },
+ { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9,
+ 10, 11, 11, 12, 13, 14, 15, 15, 16, 17,
+ 18, 19, 19, 20, 20, 21, 22, 23, 23, 24,
+ 25, 25, 25, 26, 26, 26, 27, 27, 28, 28,
+ 28, 28, 27, 27, 27, 28, 28, 28, 28, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 26, 26, 25, 26, 26, 27, 27, 27,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27,
+ 28, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 28, 28, 30, 30, 30, 30, 30, 30, 30,
+ },
+ { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14,
+ 15, 15, 16, 17, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 23, 24, 24, 24, 25,
+ 25, 25, 25, 25, 25, 26, 26, 26, 26, 27,
+ 27, 27, 27, 27, 27, 28, 28, 28, 28, 28,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ }
+ },
+
+ { // HiSeq
+ { 2, 2, 2, 3, 3, 4, 5, 5, 6, 7,
+ 8, 9, 10, 11, 11, 12, 13, 14, 15, 16,
+ 17, 17, 18, 19, 20, 21, 22, 22, 23, 24,
+ 25, 26, 27, 28, 28, 29, 30, 31, 32, 33,
+ 34, 34, 35, 36, 37, 38, 39, 39, 40, 41,
+ 42, 43, 44, 45, 45, 46, 47, 48, 49, 50,
+ 51, 51, 52, 53, 54, 55, 56, 56, 57, 58,
+ 59, 60, 61, 62, 62, 63, 64, 65, 66, 67,
+ 68, 68, 69, 70, 71, 72, 73, 73, 74, 75,
+ 76, 77, 78, 79, 79, 80, 81, 82, 83, 84,
+ },
+ { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11,
+ 13, 14, 15, 16, 17, 19, 20, 21, 22, 23,
+ 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
+ 37, 38, 39, 40, 41, 43, 44, 45, 46, 47,
+ 49, 50, 51, 52, 53, 55, 56, 57, 58, 59,
+ 61, 62, 63, 64, 65, 67, 68, 69, 70, 71,
+ 73, 74, 75, 76, 77, 79, 80, 81, 82, 83,
+ 85, 86, 87, 88, 89, 91, 92, 93, 94, 95,
+ 97, 98, 99, 100, 101, 103, 104, 105, 106, 107,
+ 109, 110, 111, 112, 113, 115, 116, 117, 118, 119,
+ },
+ { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11,
+ 13, 14, 15, 16, 17, 19, 20, 21, 22, 23,
+ 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
+ 37, 38, 39, 40, 41, 43, 44, 45, 46, 47,
+ 49, 50, 51, 52, 53, 55, 56, 57, 58, 59,
+ 61, 62, 63, 64, 65, 67, 68, 69, 70, 71,
+ 73, 74, 75, 76, 77, 79, 80, 81, 82, 83,
+ 85, 86, 87, 88, 89, 91, 92, 93, 94, 95,
+ 97, 98, 99, 100, 101, 103, 104, 105, 106, 107,
+ 109, 110, 111, 112, 113, 115, 116, 117, 118, 119,
+ }
+ },
+ { // ONT R10.4 super
+ { 0, 2, 2, 2, 3, 4, 4, 5, 6, 7,
+ 7, 8, 9, 12, 13, 14, 15, 15, 16, 17,
+ 18, 19, 20, 22, 24, 25, 26, 27, 28, 29,
+ 30, 31, 33, 34, 36, 37, 38, 38, 39, 39,
+ 40, 40, 40, 40, 40, 40, 40, 41, 40, 40,
+ 41, 41, 40, 40, 40, 40, 41, 40, 40, 40,
+ 40, 41, 41, 40, 40, 41, 40, 40, 39, 41,
+ 40, 41, 40, 40, 41, 41, 41, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+ },
+ { 0, 2, 2, 2, 3, 4, 5, 6, 7, 8,
+ 8, 9, 9, 10, 10, 10, 11, 12, 12, 13,
+ 13, 13, 14, 14, 15, 16, 16, 17, 18, 18,
+ 19, 19, 20, 21, 22, 23, 24, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ },
+ { 0, 4, 6, 6, 6, 7, 7, 8, 9, 9,
+ 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 15, 15, 15, 16, 16, 17, 17, 18, 18, 19,
+ 19, 20, 20, 21, 22, 22, 23, 23, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ }
+ },
+ { // ONT R10.4 duplex; just a copy of hifi for now
+ {10, 11, 11, 12, 13, 14, 15, 16, 18, 19,
+ 20, 21, 22, 23, 24, 25, 27, 28, 29, 30,
+ 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+ 38, 39, 39, 40, 40, 41, 41, 41, 41, 42,
+ 42, 42, 42, 43, 43, 43, 43, 43, 43, 43,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ },
+ { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9,
+ 10, 11, 11, 12, 13, 14, 15, 15, 16, 17,
+ 18, 19, 19, 20, 20, 21, 22, 23, 23, 24,
+ 25, 25, 25, 26, 26, 26, 27, 27, 28, 28,
+ 28, 28, 27, 27, 27, 28, 28, 28, 28, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 26, 26, 25, 26, 26, 27, 27, 27,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27,
+ 28, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 28, 28, 30, 30, 30, 30, 30, 30, 30,
+ },
+ { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14,
+ 15, 15, 16, 17, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 23, 24, 24, 24, 25,
+ 25, 25, 25, 25, 25, 26, 26, 26, 26, 27,
+ 27, 27, 27, 27, 27, 28, 28, 28, 28, 28,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ }
+ },
+ { // Ultima Genomics
+ { 2, 2, 3, 4, 5, 6, 6, 7, 8, 9,
+ 10, 10, 11, 12, 13, 14, 14, 15, 16, 17,
+ 18, 18, 19, 21, 22, 23, 23, 24, 25, 26,
+ 27, 27, 28, 29, 30, 31, 31, 32, 33, 34,
+ 35, 35, 36, 37, 38, 39, 39, 40, 42, 43,
+ 44, 44, 45, 46, 47, 48, 48, 49, 50, 51,
+ 52, 52, 53, 54, 55, 56, 56, 57, 58, 59,
+ 60, 60, 61, 63, 64, 65, 65, 66, 67, 68,
+ 69, 69, 70, 71, 72, 73, 73, 74, 75, 76,
+ 77, 77, 78, 79, 80, 81, 81, 82, 84, 85,
+ },
+ { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4,
+ 5, 5, 6, 6, 7, 7, 8, 8, 9, 10,
+ 10, 10, 11, 12, 13, 13, 13, 14, 15, 16,
+ 16, 16, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 22, 22, 23, 23, 24, 24, 25,
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ },
+ { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4,
+ 5, 5, 6, 6, 7, 7, 8, 8, 9, 10,
+ 10, 10, 11, 12, 13, 13, 13, 14, 15, 16,
+ 16, 16, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 22, 22, 23, 23, 24, 24, 25,
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ }
+ }
+};
+
+/*
+ * Copies the built-in calibration profile numbered id (a QCAL_* index into
+ * static_qcal) into *q.
+ *
+ * Returns 0 on success, or -1 when id does not name a profile, in which
+ * case *q is left untouched.
+ *
+ * NOTE(review): every static_qcal row lists 100 initialisers for a
+ * 101-element map, so entry [100] of a copied profile is implicitly 0.
+ * Confirm whether quality 100 can ever be looked up.
+ */
+int set_qcal(qcal_t *q, int id) {
+    const size_t n_profiles = sizeof(static_qcal) / sizeof(static_qcal[0]);
+
+    if (id >= 0 && (size_t)id < n_profiles) {
+        *q = static_qcal[id];
+        return 0;
+    }
+
+    return -1;
+}
+
+/*
+ * Fills *q with a quality recalibration table.
+ *
+ * fn is either a built-in profile name (":hifi", ":hiseq", ":r10.4_sup",
+ * ":r10.4_dup", ":ultima" or ":flat") or the name of a file opened with
+ * hopen().  In a file, lines starting with '#' are skipped and every other
+ * line must read "QUAL <qual> <subst> <undercall> <overcall>", with <qual>
+ * in ascending order.  Qualities missing from the file inherit the values
+ * of the nearest lower quality; rows with <qual> above 100 are ignored.
+ *
+ * Returns 0 on success,
+ *        -1 if the file cannot be opened, a line is malformed or the
+ *           qualities are not ascending,
+ *        -2 if closing the file failed.
+ */
+int load_qcal(qcal_t *q, const char *fn) {
+    int i;
+
+    // Built-in profiles are selected by a leading-colon name.
+    if (strcmp(fn, ":hifi") == 0)
+        return set_qcal(q, QCAL_HIFI);
+    if (strcmp(fn, ":hiseq") == 0)
+        return set_qcal(q, QCAL_HISEQ);
+    if (strcmp(fn, ":r10.4_sup") == 0)
+        return set_qcal(q, QCAL_ONT_R10_4_SUP);
+    if (strcmp(fn, ":r10.4_dup") == 0)
+        return set_qcal(q, QCAL_ONT_R10_4_DUP);
+    if (strcmp(fn, ":ultima") == 0)
+        return set_qcal(q, QCAL_ULTIMA);
+
+    // default: identity mapping, also the starting point for file input
+    for (i = 0; i < 101; i++)
+        q->smap[i] = q->umap[i] = q->omap[i] = i;
+
+    if (strcmp(fn, ":flat") == 0)
+        return 0;
+
+    hFILE *fp = hopen(fn, "r");
+    if (!fp)
+        return -1;
+
+    kstring_t line = KS_INITIALIZE;
+    int max = 0;
+    int last_qual = 0;
+    while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
+        int v, s, u, o;
+        if (*line.s == '#')
+            continue;
+        if (sscanf(line.s, "QUAL %d %d %d %d", &v, &s, &u, &o) != 4)
+            goto err;
+
+        // Carry the previous row forward over any gap.  The walk stops at
+        // the last table slot so an oversized QUAL cannot write past the
+        // 101-entry maps.
+        while (v > last_qual && last_qual < 100) {
+            q->smap[last_qual+1] = q->smap[last_qual];
+            q->umap[last_qual+1] = q->umap[last_qual];
+            q->omap[last_qual+1] = q->omap[last_qual];
+            last_qual++;
+        }
+
+        // Index 100 is a valid slot, so an explicit "QUAL 100" row is kept.
+        if (v >= 0 && v <= 100) {
+            q->smap[v] = s;
+            q->umap[v] = u;
+            q->omap[v] = o;
+        }
+
+        if (v < max) {
+            fprintf(stderr, "Qual calibration file is not in ascending order\n");
+            goto err;   // shared exit also releases the line buffer
+        }
+        max = v;
+    }
+
+    // Extend the highest quality seen to the end of the table.  The guard
+    // keeps max+1 from overflowing when a huge QUAL was supplied.
+    if (max < 100) {
+        for (i = max+1; i < 101; i++) {
+            q->smap[i] = q->smap[max];
+            q->umap[i] = q->umap[max];
+            q->omap[i] = q->omap[max];
+        }
+    }
+
+    ks_free(&line);
+    return hclose(fp) < 0 ? -2 : 0;
+
+ err:
+    ks_free(&line);
+    return hclose(fp) < 0 ? -2 : -1;
+}
+
+static void consensus_init(double p_het, double p_indel, double het_scale,
+ double poly_mul,
+ qcal_t *qcal, int mode, cons_probs *cp) {
+ int i;
+
+ // NB: only need to initialise once, but we do here for now
for (i = -500; i <= 500; i++)
e_tab[i] = exp(i);
for (i = -500; i <= 500; i++)
for (i = 0; i <= 500; i++)
e_log[i] = log(i);
- // Heterozygous locations
+ // EXPERIMENTAL
+ cp->poly_mul = poly_mul;
+
+ // The priors make very little difference, unless shallow data.
+ // ACGT* by ACGT*
+ // So AA=0, CC=6, GG=12, TT=18, **=24
for (i = 0; i < 25; i++)
- prior[i] = p_het / 20;
- prior[0] = prior[6] = prior[12] = prior[18] = prior[24] = (1-p_het)/5;
-
- lprior15[0] = log(prior[0]);
- lprior15[1] = log(prior[1]*2);
- lprior15[2] = log(prior[2]*2);
- lprior15[3] = log(prior[3]*2);
- lprior15[4] = log(prior[4]*2);
- lprior15[5] = log(prior[6]);
- lprior15[6] = log(prior[7]*2);
- lprior15[7] = log(prior[8]*2);
- lprior15[8] = log(prior[9]*2);
- lprior15[9] = log(prior[12]);
- lprior15[10] = log(prior[13]*2);
- lprior15[11] = log(prior[14]*2);
- lprior15[12] = log(prior[18]);
- lprior15[13] = log(prior[19]*2);
- lprior15[14] = log(prior[24]);
-
-
- // Rewrite as new form
+ cp->prior[i] = p_het / 6; // AC AG AT CG CT GT
+
+ // Flat assumption that it is what we observe, and measure everything else
+ // as relative to this.
+ cp->prior[0]=cp->prior[6]=cp->prior[12]=cp->prior[18]=cp->prior[24] = 1;
+
+ // heterozygous deletion
+ for (i = 4; i < 24; i+=5)
+ cp->prior[i] = p_indel / 6; // /6 to be scaled vs p_het equivalently
+
+ // heterozygous insertion
+ for (i = 20; i < 24; i++)
+ cp->prior[i] = p_indel / 6;
+
+ cp->lprior15[0] = log(cp->prior[0]);
+ cp->lprior15[1] = log(cp->prior[1]);
+ cp->lprior15[2] = log(cp->prior[2]);
+ cp->lprior15[3] = log(cp->prior[3]);
+ cp->lprior15[4] = log(cp->prior[4]);
+ cp->lprior15[5] = log(cp->prior[6]);
+ cp->lprior15[6] = log(cp->prior[7]);
+ cp->lprior15[7] = log(cp->prior[8]);
+ cp->lprior15[8] = log(cp->prior[9]);
+ cp->lprior15[9] = log(cp->prior[12]);
+ cp->lprior15[10] = log(cp->prior[13]);
+ cp->lprior15[11] = log(cp->prior[14]);
+ cp->lprior15[12] = log(cp->prior[18]);
+ cp->lprior15[13] = log(cp->prior[19]);
+ cp->lprior15[14] = log(cp->prior[24]);
+
for (i = 1; i < 101; i++) {
- double prob = 1 - pow(10, -i / 10.0);
-
- // May want to multiply all these by 5 so pMM[i] becomes close
- // to -0 for most data. This makes the sums increment very slowly,
- // keeping bit precision in the accumulator.
- pMM[i] = log(prob/5);
- p__[i] = log((1-prob)/20);
- p_M[i] = log((exp(pMM[i]) + exp(p__[i]))/2);
+ double prob = 1 - pow(10, -qcal->smap[i] / 10.0);
+
+ // Or is it that prob is 1-p(subst)-p(overcall)?
+ cp->pMM[i] = log(prob);
+
+ //cp->p__[i] = log(1-prob); // Big help to PB-CCS SNPs; unless fudged
+ cp->p__[i] = log((1-prob)/3); // correct? poor on PB-CCS w/o fudge
+
+ // Mixed alleles; just average two likelihoods
+ cp->p_M[i] = log((exp(cp->pMM[i]) + exp(cp->p__[i]))/2);
+
+ // What does this really mean? Can we simulate this by priors?
+ // It reduces the likelihood of calling het sites, which is
+ // maybe compensation for alignment artifacts? I'm unsure,
+ // but it works (to differing degrees) on both PacBio HiFi and
+ // Illumina HiSeq. It (obviously) loses true hets, but
+ // potentially this can be compensated for by tweaking P-het
+ // (which is entirely in the priors).
+ //
+ // Low het_scale reduces false positives by making hets less
+ // likely to be called. In high depth data we normally have
+ // enough evidence to call correctly even with low het_scale,
+ // so it's a good +FN vs --FP tradeoff. However on low depth
+ // data, het_scale can filter out too many true variants.
+ //
+ // TODO: So consider adjusting at the end maybe?
+ // Also consider never changing calls, but changing their
+ // confidence, so the data is what produces the call with the
+ // parameters skewing the quality score distribution.
+ cp->p_M[i] += log(het_scale);
+
+ if (mode == MODE_BAYES_116) {
+ // Compatibility with samtools 1.16
+
+ // This had no differentiation for indel vs substitution error rates,
+ // so o(vercall) and u(ndercall) are subst(_).
+ cp->pmm[i] = cp->pMM[i];
+ cp->poM[i] = cp->p_M[i];
+ cp->pum[i] = cp->p_M[i];
+ cp->po_[i] = cp->p__[i];
+ cp->poo[i] = cp->p__[i];
+ cp->puu[i] = cp->p__[i];
+
+ } else {
+ // When observing A C G T; leads to insertion calls
+ prob = 1 - pow(10, -qcal->omap[i] / 10.0);
+ // /3 for consistency with ACGT rem as relative likelihoods.
+ // Otherwise with flat priors we end up calling all shallow data
+ // as "*", which is illogical.
+ cp->poo[i] = log((1-prob)/3);
+
+ // Ensure pMM is always more likely. (NB: This shouldn't happen
+ // now with the addition of the /3 step above.)
+ if (cp->poo[i] > cp->pMM[i]-.5)
+ cp->poo[i] = cp->pMM[i]-.5;
+
+ cp->po_[i] = log((exp(cp->poo[i]) + exp(cp->p__[i]))/2);
+ cp->poM[i] = log((exp(cp->poo[i]) + exp(cp->pMM[i]))/2);
+
+ // Overcalls should never be twice as likely as mismatches.
+ // Het bases are mix of _M (other) and MM ops (this).
+ // It's fine for _M to be less likely than oM (more likely
+ // to be overcalled than miscalled), but it should never
+ // be stronger when combined with other mixed data.
+ if (cp->poM[i] > cp->p_M[i]+.5)
+ cp->poM[i] = cp->p_M[i]+.5;
+
+ // Note --low-MQ and --scale-MQ have a big impact on
+ // undercall errs. May need to separate these options per
+ // type, but how?
+ // Multiple-calls, as with mixed mode? This feels like a cheat
+
+ prob = 1 - pow(10, -qcal->umap[i] / 10.0);
+ cp->pmm[i] = log(prob);
+ cp->puu[i] = log((1-prob)/3);
+ if (cp->puu[i] > cp->pMM[i]-.5) // MM is -ve
+ cp->puu[i] = cp->pMM[i]-.5;
+
+ cp->pum[i] = log((exp(cp->puu[i]) + exp(cp->pmm[i]))/2);
+ }
}
- pMM[0] = pMM[1];
- p__[0] = p__[1];
- p_M[0] = p_M[1];
+ cp->pMM[0] = cp->pMM[1];
+ cp->p__[0] = cp->p__[1];
+ cp->p_M[0] = cp->p_M[1];
+
+ cp->pmm[0] = cp->pmm[1];
+ cp->poo[0] = cp->poo[1];
+ cp->po_[0] = cp->po_[1];
+ cp->poM[0] = cp->poM[1];
+ cp->puu[0] = cp->puu[1];
+ cp->pum[0] = cp->pum[1];
}
static inline double fast_exp(double y) {
return indel;
}
+/*
+ * Some machines, including 454 and PacBio, store the quality values in
+ * homopolymers with the first or last base always being the low quality
+ * state. This can cause problems when reverse-complementing and aligning,
+ * especially when we left-justify indels.
+ *
+ * Other platforms take the approach of having the middle bases high and
+ * the low confidence spread evenly to both start and end. This means
+ * reverse-complementing doesn't introduce any strand bias.
+ *
+ * We redistribute qualities within homopolymers in this style to fix
+ * naive consensus or variant calling algorithms.
+ *
+ * Method: each run of identical bases is walked inwards from both ends.
+ * The two qualities of every outer pair are turned into error
+ * probabilities (10^(-q/10)), averaged, and a single value derived from
+ * that average is written back to both positions. The centre base of an
+ * odd-length run, and any base that is not part of a run, is left
+ * unchanged.
+ *
+ * b: record whose quality array is rewritten in place; the sequence is
+ * only read.
+ *
+ * NOTE(review): the lookup table is a function-local static filled on the
+ * first call with no locking visible here; confirm callers are
+ * single-threaded.
+ */
+void homopoly_qual_fix(bam1_t *b) {
+ static double ph2err[256] = {0}; // quality value -> error probability
+ int i;
+ if (!ph2err[0]) { // entry 0 becomes pow(10, 0) == 1 once filled, so this runs only once
+ for (i = 0; i < 256; i++)
+ ph2err[i] = pow(10, i/-10.0);
+ }
+ uint8_t *seq = bam_get_seq(b);
+ uint8_t *qual = bam_get_qual(b);
+ for (i = 0; i < b->core.l_qseq; i++) {
+ int s = i; // start of homopoly
+ int base = bam_seqi(seq, i);
+ while (i+1 < b->core.l_qseq && bam_seqi(seq, i+1) == base)
+ i++;
+ // s..i inclusive is now homopolymer
+
+ if (s == i) // run of length one: nothing to redistribute
+ continue;
+
+ // Simplest: reverse if end_qual < start_qual
+ // Next: average outer-most two, then next two, etc
+ // Best: fully redistribute so start/end lower qual than centre
+
+ // Middle route of averaging outer pairs is sufficient?
+ int j, k;
+ for (j = s, k = i; j < k; j++,k--) { // j and k close in from both ends; stops before the centre base
+ double e = ph2err[qual[j]] + ph2err[qual[k]];
+ qual[j] = qual[k] = -fast_log2(e/2)*3.0104+.49; // 3.0104 ~= 10*log10(2); presumably fast_log2() approximates log2 and +.49 rounds before truncation - TODO confirm
+ }
+ }
+}
+
// Return the local NM figure within halo (+/- HALO) of pos.
// This local NM is used as a way to modify MAPQ to get a localised MAPQ
// score via an adhoc fashion.
return 0;
pos -= b->core.pos;
if (pos < 0)
- return nm[0];
+ return nm[0] & ((1<<24)-1);
if (pos >= b->core.l_qseq)
- return nm[b->core.l_qseq-1];
+ return nm[b->core.l_qseq-1] & ((1<<24)-1);
+
+ return (nm[pos] & ((1<<24)-1)) / 10.0;
+}
- return nm[pos] / 10.0;
+int poly_len(const pileup_t *p, const bam1_t *b, hts_pos_t pos) { // value held in the upper bits of the per-read array at pos, or 0
+ int *nm = (int *)p->cd; // per-read int array attached to the pileup entry
+ if (!nm)
+ return 0; // no per-read data available
+ pos -= b->core.pos; // make pos relative to the read's start position
+ if (pos >= 0 && pos < b->core.l_qseq)
+ return nm[pos] >> 24; // bits 24 and up; the low 24 bits are what nm_local() masks out
+ else
+ return 0; // index falls outside the array
}
/*
const bam1_t *b = &p->b;
int qlen = b->core.l_qseq, i;
+ if (qlen <= 0)
+ return 0;
int *local_nm = calloc(qlen, sizeof(*local_nm));
if (!local_nm)
return -1;
p->cd = local_nm;
+ double poly_adj = opts->homopoly_fix ? opts->homopoly_fix : 1;
+
if (opts->adj_qual) {
-#if 0
- // Tweak by localised quality.
- // Quality is reduced by a significant portion of the minimum quality
- // in neighbouring bases, on the pretext that if the region is bad, then
- // this base is bad even if it claims otherwise.
+ // Set local_nm based on a function of current qual and the local
+ // minimum qual within the surrounding window.
+ //
+ // Basically if we're in a region of low confidence then we downgrade
+ // higher qual outliers as they may not be as trustworthy as they
+ // claim. This may be because the qualities have been assigned to
+ // the wrong or arbitrary base (very common in homopolymers), or the
+ // surrounding quality (hence also error likelihood) have led to
+ // misalignments and the base may be contributing to the wrong
+ // pileup column.
+ //
+ // The nm_local() function returns these scores and uses it to bias
+ // the mapping quality, which in turn adjusts base quality.
uint8_t *qual = bam_get_qual(b);
- const int qhalo = 8; // 2?
- int qmin = 50; // effectively caps PacBio qual too
+ uint8_t *seq = bam_get_seq(b);
+ const int qhalo = 8; // window size for base qual
+ int qmin = qual[0]; // min qual within qhalo
+ const int qhalop = 2;// window size for homopolymer qual
+ int qminp = qual[0]; // min qual within homopolymer halo
+ int base = bam_seqi(seq, 0), polyl = 0, polyr = 0; // pos, not len
+
+ // Minimum quality of the initial homopolymer
+ for (i = 1; i < qlen; i++) {
+ if (bam_seqi(seq, i) != base)
+ break;
+ if (i < qhalop && qminp > qual[i])
+ qminp = qual[i];
+ }
+
+ // Minimum quality for general bases
for (i = 0; i < qlen && i < qhalo; i++) {
- local_nm[i] = qual[i];
if (qmin > qual[i])
qmin = qual[i];
}
+
for (;i < qlen-qhalo; i++) {
- //int t = (qual[i]*1 + 3*qmin)/4; // good on 60x
- int t = (qual[i] + 5*qmin)/4; // good on 15x
- local_nm[i] = t < qual[i] ? t : qual[i];
- if (qmin > qual[i+qhalo])
- qmin = qual[i+qhalo];
- else if (qmin <= qual[i-qhalo]) {
+ if (opts->homopoly_fix && bam_seqi(seq, i) != base) {
+ polyl = i;
+ base = bam_seqi(seq, i);
+ qminp = qual[i];
int j;
- qmin = 50;
- for (j = i-qhalo+1; j <= i+qhalo; j++)
- if (qmin > qual[j])
- qmin = qual[j];
+ for (j = i+1; j < qlen; j++) {
+ if (bam_seqi(seq, j) != base)
+ break;
+ if (i < qhalop && qminp > qual[j])
+ qminp = qual[j];
+ }
+ polyr = j-1;
+ } else {
+ // CHECK: do we want to have opts->homopoly_fix above,
+ // so when not in use we don't define pl to non-zero?
+ // Test on SynDip
+ polyr = polyl;
}
- }
- for (; i < qlen; i++) {
- local_nm[i] = qual[i];
- local_nm[i] = (local_nm[i] + 6*qmin)/4;
- }
+ int pl = polyr-polyl;
- for (i = 0; i < qlen; i++) {
- qual[i] = local_nm[i];
+ // Useful for SNPS, as we're judging the variation in
+ // length as an indicator for base-misalignment.
+ // Not so useful for indel calling where the longer the indel
+ // the less confident we are on the len variation being real.
+ int t = (opts->mode == MODE_BAYES_116)
+ ? (qual[i] + 5*qmin)/4
+ : qual[i]/3 + (qminp-pl*2)*poly_adj;
- // Plus overall rescale.
- // Lower becomes lower, very high becomes a little higher.
- // Helps deep GIAB, but detrimental elsewhere. (What this really
- // indicates is quality calibration differs per data set.)
- // It's probably something best accounted for somewhere else.
- //qual[i] = qual[i]*qual[i]/40+1;
- }
- memset(local_nm, 0, qlen * sizeof(*local_nm));
-#else
- // Skew local NM by qual vs min-qual delta
- uint8_t *qual = bam_get_qual(b);
- const int qhalo = 8; // 4
- int qmin = 99;
- for (i = 0; i < qlen && i < qhalo; i++) {
- if (qmin > qual[i])
- qmin = qual[i];
- }
- for (;i < qlen-qhalo; i++) {
- int t = (qual[i] + 5*qmin)/4; // good on 15x
- local_nm[i] += t < qual[i] ? (qual[i]-t) : 0;
+ local_nm[i] += t < qual[i] ? qual[i]-t : 0;
+
+ // Brute force qminp in polyl to polyr range.
+ // TODO: optimise this with sliding window
+ qminp = qual[i];
+ int k;
+ for (k = MAX(polyl,i-qhalop); k <= MIN(polyr,i+qhalop); k++)
+ if (qminp > qual[k])
+ qminp = qual[k];
+
if (qmin > qual[i+qhalo])
qmin = qual[i+qhalo];
else if (qmin <= qual[i-qhalo]) {
}
}
for (; i < qlen; i++) {
- int t = (qual[i] + 5*qmin)/4; // good on 15x
- local_nm[i] += t < qual[i] ? (qual[i]-t) : 0;
+ int t = (opts->mode == MODE_BAYES_116)
+ ? (qual[i] + 5*qmin)/4
+ : qual[i]/3 + qminp*poly_adj;
+ local_nm[i] += t < qual[i] ? qual[i]-t : 0;
}
-#endif
+ }
+
+ // Fix e.g. PacBio homopolymer qualities
+ if (opts->homopoly_fix)
+ homopoly_qual_fix((bam1_t *)b);
+
+ // local_nm[i] & ((1<<24)-1) is for SNP score adjustment.
+ // We also put some more basic poly-X len in local_nm[i] >> 24.
+ uint8_t *seq = bam_get_seq(b);
+ for (i = 0; i < qlen; i++) {
+ int base = bam_seqi(seq, i);
+ int poly = 0, j, k;
+ for (j = i+1; j < qlen; j++)
+ if (bam_seqi(seq, j) != base)
+ break;
+ //printf("%d x %d\n", base, j-i);
+
+ poly = j-i-1; if (poly > 100) poly = 100;
+ const int HALO=0;
+ for (k = i-HALO; k < j+HALO; k++)
+ if (k >= 0 && k < qlen)
+ local_nm[k] = ((MAX(poly, local_nm[k]>>24))<<24)
+ | (local_nm[k] & ((1<<24)-1));
+
+ i = j-1;
}
// Adjust local_nm array by the number of edits within
}
// substitution
- for (i = pos-halo*2 >= 0 ? pos-halo*2 : 0; i < pos-halo; i++)
+ for (i = pos-halo*2 >= 0 ?pos-halo*2 :0; i < pos-halo && i < qlen; i++)
local_nm[i]+=5;
for (; i < pos+halo && i < qlen; i++)
local_nm[i]+=10;
return 1;
}
+void nm_free(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) { // pileup_loop() callback; client_data, fp and h are unused
+ free(p->cd); // release the per-read array stored in p->cd
+ p->cd = NULL; // leave no dangling pointer behind
+}
+
+#ifdef DO_HDW
+/*
+ * Stirling's formula with a 1/12n correction applied to improve accuracy.
+ * This seems to hold remarkably true for both low and high numbers too.
+ *
+ * Returns an approximation of ln(n!). n is a double, so non-integer
+ * arguments are accepted. n must be greater than zero: both log(n) and
+ * 1/(12n) are non-finite at n == 0, so lnfact(0) does not give ln(0!) = 0.
+ */
+double lnfact(double n) {
+ /* Or Gosper's formula...
+ * return (n*ln(n) - n + ln(2*M_PI*n + M_PI/3) / 2);
+ */
+ return ((n+0.5)*log(n) - n + log(2*M_PI)/2) + log(1 + 1/(12.0*n));
+ /* + log(1 + 1/(288.0*n*n)); */
+}
+
+/*
+ * The binomial coefficient (n,k) for n trials with k successes where
+ * prob(success) = p.
+ *
+ * P_p(k|n) = n! / (k! (n-k)!) * p^k * (1-p)^(n-k)
+ *
+ * The coefficient we are returning here is the n! / (k! (n-k)!) bit.
+ * We compute it using ln(n!) and then exp() the result back to avoid
+ * excessively large numbers.
+ *
+ * Because it is built from lnfact(), the result is an approximation and
+ * n, k and n-k must all be greater than zero.
+ */
+double bincoef(int n, double k) {
+ return exp(lnfact(n) - lnfact(k) - lnfact(n-k));
+}
+
+/*
+ * Given p == 0.5 the binomial expansion simplifies a bit, so we have
+ * a dedicated function for this.
+ *
+ * With p == 0.5, p^k * (1-p)^(n-k) collapses to 0.5^n, so the value
+ * returned is bincoef(n, k) * 0.5^n.
+ */
+double binprobhalf(int n, double k) {
+ return bincoef(n, k) * pow(0.5, n);
+}
+
+double lnbinprobhalf(int n, double k) { // natural log of binprobhalf(n, k)
+ // ln(binprobhalf) expanded and simplified: ln(C(n,k)) + n*ln(0.5)
+ return lnfact(n) - lnfact(k) - lnfact(n-k) - 0.69315*n; // 0.69315 ~= ln(2)
+}
+#endif
static
int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth,
pileup_t *plp, consensus_opts *opts,
- consensus_t *cons, int default_qual) {
+ consensus_t *cons, int default_qual,
+ cons_probs *cp) {
int i, j;
static int init_done =0;
static double q2p[101], mqual_pow[256];
// if it's rare.
// Helps a bit on deep data, especially with K2=3, but detrimental on
// shallow and (currently) quite a slow down.
-
-//#define K2 2
#ifdef K2
int hashN[1<<(K2*4+2)] = {0};
int hash1[1<<2] = {0};
if (!init_done) {
init_done = 1;
- consensus_init(opts->P_het);
for (i = 0; i <= 100; i++) {
q2p[i] = pow(10, -i/10.0);
/* Initialise */
int counts[6] = {0};
+#ifdef DO_FRACT
+ int counts2[2][6] = {{0}};
+#endif
/* Accumulate */
int td = depth; // original depth
depth = 0;
+#ifdef DO_POLY_DIST
+ int poly_dist[2][100] = {0};
+#endif
for (; plp; plp = plp->next) {
pileup_t *p = plp;
int base = i >= 0 && i < p->b.core.l_qseq ? X[bam_seqi(seq,i)] : _;
hb = (hb<<2)|base;
}
- // fprintf(stderr, "%c: %d %d of %d\t%d %d\n", p->base, hashN[hb], hash1[base1], td, p->qual, p->qual * hashN[hb] / hash1[base1]);
#undef _
#endif
// convert from sam base to acgt*n order.
base = L[base];
- double MM, __, _M, qe;
+ double MM, __, _M, oo, oM, o_, uu, um, mm, qe;
// Correction for mapping quality. Maybe speed up via lookups?
// Cannot nullify mapping quality completely. Lots of (true)
if (flags & CONS_MQUAL) {
int mqual = b->core.qual;
if (opts->nm_adjust) {
- mqual /= (nm_local(p, b, pos)+1);
+ //mqual /= (nm_local(p, b, pos)+1);
+ mqual /= (nm_local(p, b, b->core.pos + p->seq_offset+1)+1);
mqual *= 1 + 2*(0.5-(td>30?30:td)/60.0); // depth fudge
}
if (qual < 1)
qual = 1;
- __ = p__[qual]; // neither match
- MM = pMM[qual] - __; // both match
- _M = p_M[qual] - __; // one allele only (half match)
+ double poly = poly_len(p, b, b->core.pos + p->seq_offset+1);
+#ifdef DO_POLY_DIST
+ poly_dist[bam_is_rev(b)][MIN(99,(int)poly)]++;
+#endif
+
+ // EXPERIMENTAL
+ // Adjust qual based on homopolymer length.
+ // Affects different platforms by differing amounts.
+ // May wish to further separate to qual2 and qual3 for ins and del?
+ int qual2 = MAX(1, qual-(poly-2)*cp->poly_mul);
+
+ /* MM=match _M=half-match __=mismatch */
+ __ = cp->p__[qual]; // neither match
+ MM = cp->pMM[qual] - __; // both match
+ _M = cp->p_M[qual] - __; // one allele only (half match)
+
+ /* observation ACGT, but against hypothesis ** or *base */
+ oo = cp->poo[qual2] - __;
+ oM = cp->poM[qual2] - __;
+ o_ = cp->po_[qual2] - __;
+
+ /* observation * */
+ uu = cp->puu[qual2] - __;
+ um = cp->pum[qual2] - __;
+ mm = cp->pmm[qual2] - __;
if (flags & CONS_DISCREP) {
qe = q2p[qual];
sumsC[base] += 1 - qe;
}
+
counts[base]++;
+#ifdef DO_FRACT
+ counts2[bam_is_rev(b)][base]++;
+#endif
+
+ // oM should never be higher than _M for actual bases! or...
+ //printf("base %d@%d MM %f _M %f oM %f\n", base, qual, MM, _M, oM);
switch (base) {
case 0: // A
- S[0] += MM;
- S[1] += _M;
- S[2] += _M;
- S[3] += _M;
- S[4] += _M;
+ S[0] += MM;
+ S[1] += _M;
+ S[2] += _M;
+ S[3] += _M;
+ S[4] += oM;
+ S[8] += o_;
+ S[11] += o_;
+ S[13] += o_;
+ S[14] += oo;
break;
case 1: // C
- S[1] += _M;
- S[5] += MM;
- S[6] += _M;
- S[7] += _M;
- S[8] += _M;
+ S[1] += _M;
+ S[5] += MM;
+ S[6] += _M;
+ S[7] += _M;
+ S[8] += oM;
+ S[4] += o_;
+ S[11] += o_;
+ S[13] += o_;
+ S[14] += oo;
+
+ //fprintf(stderr, "%d %f %f %f\n", qual, MM+__, oo+__, MM-oo);
break;
case 2: // G
S[ 6] += _M;
S[ 9] += MM;
S[10] += _M;
- S[11] += _M;
+ S[11] += oM;
+ S[4] += o_;
+ S[8] += o_;
+ S[13] += o_;
+ S[14] += oo;
break;
case 3: // T
- S[ 3] += _M;
+ S[ 3] += _M; // _m
S[ 7] += _M;
S[10] += _M;
- S[12] += MM;
- S[13] += _M;
+ S[12] += MM; // mm
+ S[13] += oM;
+ S[4] += o_;
+ S[8] += o_;
+ S[11] += o_;
+ S[14] += oo;
+ // S[14] oo
break;
case 4: // *
- S[ 4] += _M;
- S[ 8] += _M;
- S[11] += _M;
- S[13] += _M;
- S[14] += MM;
+ // under under under under agree-no-base
+ S[0] += uu; S[1 ]+= uu; S[2 ]+= uu; S[3 ]+= uu; S[4 ]+= um;
+ S[5 ]+= uu; S[6 ]+= uu; S[7 ]+= uu; S[8 ]+= um;
+ S[9 ]+= uu; S[10]+= uu; S[11]+= um;
+ S[12]+= uu; S[13]+= um;
+ S[14]+= mm;
break;
case 5: /* N => equal weight to all A,C,G,T but not a pad */
- S[ 0] += MM;
- S[ 1] += MM;
- S[ 2] += MM;
- S[ 3] += MM;
- S[ 4] += _M;
-
- S[ 5] += MM;
- S[ 6] += MM;
- S[ 7] += MM;
- S[ 8] += _M;
-
- S[ 9] += MM;
- S[10] += MM;
- S[11] += _M;
-
- S[12] += MM;
- S[13] += _M;
+ S[0] += MM; S[1 ]+= MM; S[2 ]+= MM; S[3 ]+= MM; S[4 ]+= oM;
+ S[5 ]+= MM; S[6 ]+= MM; S[7 ]+= MM; S[8 ]+= oM;
+ S[9 ]+= MM; S[10]+= MM; S[11]+= oM;
+ S[12]+= MM; S[13]+= oM;
+ S[14]+= oo;
break;
}
depth++;
+ }
+
+#ifdef DO_POLY_DIST
+ // Or compute mean and s.d per strand.
+ // Then compare likelihood of strands coming from the same distribution?
+ // eg s.d=0.59 vs mean=3.41 sd=0.54... hmm
+ //
+ // Or compare ratio of most frequent to next most frequent, for each
+ // strand.
+
+ int d1 = 0, d2 = 0;
+ double nd1 = 0, nd2 = 0;
+ int k;
+ for (k = 0; k < 100; k++) {
+ if (!poly_dist[0][k] && !poly_dist[1][k])
+ continue;
- if (p->eof && p->cd) {
- free(p->cd);
- p->cd = NULL;
+// fprintf(stdout, "%ld %d %2d %2d\n", pos, k, poly_dist[0][k], poly_dist[1][k]);
+ d1 += (k+1)*poly_dist[0][k];
+ d2 += (k+1)*poly_dist[1][k];
+ nd1 += poly_dist[0][k];
+ nd2 += poly_dist[1][k];
+ }
+// printf("Avg = %f / %f %f / %f / %f\n",
+// (d1+d2+1)/(nd1+nd2+1.),
+// (d1+1)/(nd1+1.), (d2+1)/(nd2+1.),
+// (d2+1)/(nd2+1.) - (d1+1)/(nd1+1.),
+// ((d2+1)/(nd2+1.) - (d1+1)/(nd1+1.)) / ((d1+d2+1)/(nd1+nd2+1.)));
+
+ // Find the top two frequent lengths
+ int n1 = 0, n2 = 0, l1 = 0, l2 = 0;
+ for (k = 0; k < 100; k++) {
+ int poly12 = poly_dist[0][k]+poly_dist[1][k];
+ if (n1 < poly12) {
+ n2 = n1; l2 = l1;
+ n1 = poly12;
+ l1 = k;
+ } else if (n2 < poly12) {
+ n2 = poly12;
+ l2 = k;
}
}
+ const double N = 5;
+ nd1 += 1;
+ nd2 += 1;
+
+ // l1 is most common length
+ int pn1p = poly_dist[0][l1];
+ int pn1m = poly_dist[1][l1];
+ // l2 2nd most common
+ int pn2p = poly_dist[0][l2];
+ int pn2m = poly_dist[1][l2];
+
+ // ratio if two most common lengths on +
+ double s1 = (pn1p+N) / (pn2p+N); s1 = s1>1?1/s1:s1;
+ // ratio if two most common lengths on -
+ double s2 = (pn1m+N) / (pn2m+N); s2 = s2>1?1/s2:s2;
+
+ // ratio of s1 and s2 to identify strand bias
+ double sbias = s1 / s2; sbias = sbias>1?1/sbias:sbias;
+
+ if (pn2p+pn2m > 0 && l1 != l2) {
+// printf("len %d,%d + %d,%d - %d,%d\tbias = %f %f, %f %f\t%ld\n",
+// l1, l2, pn1p, pn2p, pn1m, pn2m,
+// s1, s2, sbias, sqrt(sbias)-1, pos);
+
+ // adjust score for het indels
+ // sbias is close to 0 for strong strand bias, and 1 for none
+ sbias = 10*log(sbias);//+.5);
+ S[ 4] += sbias; // A*
+ S[ 8] += sbias; // C*
+ S[11] += sbias; // G*
+ S[13] += sbias; // T*
+ } else {
+ sbias = 0;
+ }
+#endif
+
/* We've accumulated stats, so now we speculate on the consensus call */
double shift, max, max_het, norm[15];
int call = 0, het_call = 0;
max = -DBL_MAX;
max_het = -DBL_MAX;
+#ifdef DO_FRACT
+ // Filter by --min-depth and --het-fract.
+ // Also add a slight adjustment for strand bias.
+ for (j = 0; j < 15; j++) {
+ if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14)
+ continue;
+
+ double c1p = counts2[0][map_het[j]%5];
+ double c1m = counts2[1][map_het[j]%5];
+ double c2p = counts2[0][map_het[j]/5];
+ double c2m = counts2[1][map_het[j]/5];
+
+ double c1 = c1p + c1m;
+ double c2 = c2p + c2m;
+
+ if (c1 && c2) {
+ // Slight decrease in confidence if strong strand bias.
+ const int N = 10; // avoid low sample size problems
+ double b1 = 1 - (N+MIN(c1p,c1m))/(N+MAX(c1p,c1m));
+ double b2 = 1 - (N+MIN(c2p,c2m))/(N+MAX(c2p,c2m));
+ if (b1 > 0.5) S[j] -= b1;
+ if (b2 > 0.5) S[j] -= b2;
+
+ // Fraction based filtering, via --min-depth and --het-fract opts.
+ c1 += 1e-5;
+ c2 += 1e-5;
+ if (c2 > c1) {
+ double tmp = c2;
+ c2 = c1;
+ c1 = tmp;
+ }
+
+ if (c2 < opts->min_depth)
+ S[j] -= 100;
+ if (c2 / (c1+1e-5) <= opts->het_fract)
+ S[j] -= 100;
+ }
+ }
+#endif
+
+#ifdef DO_HDW
+ /*
+ * Apply Hardy-Weinberg statistics for heterozygous sites.
+ * This helps, but it also loses sensitivity a little.
+ */
for (j = 0; j < 15; j++) {
- S[j] += lprior15[j];
+ if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14)
+ continue;
+
+ double c1 = counts[map_het[j]%5];
+ double c2 = counts[map_het[j]/5];
+
+ if (c1 && c2) {
+ c1 += 1e-5;
+ c2 += 1e-5;
+ if (c2 > c1) {
+ double tmp = c2;
+ c2 = c1;
+ c1 = tmp;
+ }
+
+ // Limit depth for HW as we'll have an allele freq difference,
+ // even if it's just caused by alignment reference bias.
+ double c12 = c1+c2;
+ if (c12 > 20) {
+ c2 *= 20/(c12);
+ c12 = 20;
+ c1 = 20-c2;
+ }
+
+ // Helps a little, especially reducing FN deletions.
+ c1+=1;
+ c2+=1;
+ c12+=2;
+ S[j] += lnbinprobhalf(c12, c2) + fast_log2(c12)*0.69+.2;
+ }
+ }
+#endif
+
+ for (j = 0; j < 15; j++) {
+ S[j] += cp->lprior15[j];
if (shift < S[j])
shift = S[j];
return 0;
}
+// Wrapper around calculate_consensus_gap5(). Unless opts->mode is MODE_MIXED
+// it makes a single call, using cp_p for MODE_PRECISE and cp_r otherwise.
+// In MODE_MIXED (experimental) both parameter sets are evaluated: agreeing
+// calls get their quality boosted by up to 20, while differing calls are
+// resolved by the het_logodd / phred rules below with an adjusted quality.
+// Returns the single call's result when not mixed, and always 0 when mixed.
+int calculate_consensus_gap5m(hts_pos_t pos, int flags, int depth,
+ pileup_t *plp, consensus_opts *opts,
+ consensus_t *cons, int default_qual,
+ cons_probs *cp_r, cons_probs *cp_p) {
+ if (opts->mode != MODE_MIXED)
+ return calculate_consensus_gap5(pos, flags, depth, plp, opts,
+ cons, default_qual,
+ opts->mode == MODE_PRECISE
+ ? cp_p : cp_r);
+
+ // EXPERIMENTAL: mixed mode
+ consensus_t consP, consR;
+ // Favours precision
+ calculate_consensus_gap5(pos, flags, depth, plp, opts,
+ &consP, default_qual, cp_p); // NOTE(review): return value is not checked
+ // Favours recall
+ calculate_consensus_gap5(pos, flags, depth, plp, opts,
+ &consR, default_qual, cp_r); // NOTE(review): return value is not checked
+
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#define MAX(a,b) ((a)>(b)?(a):(b))
+
+ // Initial starting point is precise mode
+ memcpy(cons, &consP, sizeof(consP));
+
+ if (consP.phred > 0 && consR.phred > 0 && consP.call == consR.call) {
+ // Both strategies match as HOM
+ // Boost qual as both in agreement
+ cons->phred += MIN(20, consR.phred); // boost is capped at 20
+
+ } else if (consP.het_logodd >= 0 && consR.het_logodd >= 0 &&
+ consP.het_call == consR.het_call) {
+ // Both strategies match as HET
+ // Boost qual as both in agreement
+ cons->het_logodd += MIN(20, consR.het_logodd); // boost is capped at 20
+
+ } else if (consP.het_logodd >= 0) {
+ // Accurate method claims heterozygous, so go with it.
+ // However sensitive method disagrees, so reduce qual a little.
+ int q2 = MAX(consR.phred, consR.het_logodd);
+ cons->het_logodd = MAX(1, (cons->het_logodd - q2/2)); // never drops below 1
+
+ } else if (consR.het_logodd >= 70) {
+ // Accurate is homozygous and consR is het, so we go with it instead
+ // but at a lower quality value.
+ // TODO: may wish to check HET is consistent with HOM? Very unlikely
+ // not to be though.
+ int q1 = consP.phred;
+ int q2 = consR.het_logodd;
+ memcpy(cons, &consR, sizeof(consR)); // switch the result over to the recall call
+ cons->het_logodd = MIN(15, MAX((q2-q1*2)/2, 1+q2/(q1+1.0))); // capped at 15; phred is kept from consR here
+
+ } else if (consR.het_logodd >= 0) {
+ // As above, but low quality
+ int q1 = consP.phred;
+ int q2 = consR.het_logodd;
+ memcpy(cons, &consR, sizeof(consR)); // switch the result over to the recall call
+ cons->het_logodd = MAX(1,q2 - 0.3*q1)
+ + 5*(consP.het_call == consR.het_call); // +5 when both picked the same het_call
+ cons->phred = 0;
+
+ } else if (consR.het_logodd < 0) { // always true here: the >= 0 cases were handled above
+ // Neither are heterozygous, but differing in phred call (V rare).
+ // Pick highest qual, after some scaling?
+ consR.phred = consR.phred / 2;
+ if (consR.phred > consP.phred)
+ memcpy(cons, &consR, sizeof(consR));
+ cons->phred = MAX(10, cons->phred); // floor the reported quality at 10
+ }
+
+ return 0;
+}
/* --------------------------------------------------------------------------
* Main processing logic
* standard pileup criteria (eg COG-UK / CLIMB Covid-19 seq project).
*
*
- * call1 / score1 / depth1 is the highest scoring allele.
- * call2 / score2 / depth2 is the second highest scoring allele.
+ * call1 / score1 is the highest scoring allele.
+ * call2 / score2 is the second highest scoring allele.
*
* Het_fract: score2/score1
* Call_fract: score1 or score1+score2 over total score
- * Min_depth: minimum total depth of utilised bases (depth1+depth2)
+ * Min_depth: minimum total depth of unfiltered bases (above qual/mqual)
* Min_score: minimum total score of utilised bases (score1+score2)
*
* Eg het_fract 0.66, call_fract 0.75 and min_depth 10.
static int calculate_consensus_simple(const pileup_t *plp,
consensus_opts *opts, int *qual) {
int i, min_qual = opts->min_qual;
+ int tot_depth = 0;
// Map "seqi" nt16 to A,C,G,T compatibility with weights on pure bases.
// where seqi is A | (C<<1) | (G<<2) | (T<<3)
freq[16] ++;
score[16]+=8 * (opts->use_qual ? q : 1);
}
+ tot_depth++;
}
// Total usable depth
// Best and second best potential calls
int call1 = 15, call2 = 15;
- int depth1 = 0, depth2 = 0;
int score1 = 0, score2 = 0;
for (i = 0; i < 5; i++) {
int c = 1<<i; // A C G T *
if (score1 < score[c]) {
- depth2 = depth1;
score2 = score1;
call2 = call1;
- depth1 = freq[c];
score1 = score[c];
call1 = c;
} else if (score2 < score[c]) {
- depth2 = freq[c];
score2 = score[c];
call2 = c;
}
// Work out which best and second best are usable as a call
int used_score = score1;
- int used_depth = depth1;
int used_base = call1;
if (score2 >= opts->het_fract * score1 && opts->ambig) {
used_base |= call2;
used_score += score2;
- used_depth += depth2;
}
// N is too shallow, or insufficient proportion of total
- if (used_depth < opts->min_depth ||
+ if (tot_depth < opts->min_depth ||
used_score < opts->call_fract * tscore) {
- used_depth = 0;
// But note shallow gaps are still called gaps, not N, as
// we're still more confident there is no base than it is
// A, C, G or T.
- used_base = call1 == 16 /*&& depth1 >= call_fract * depth*/
- ? 16 : 0; // * or N
+ used_base = call1 == 16 ? 16 : 0; // * or N
}
// Our final call. "?" shouldn't be possible to generate
"NACMGRSVTWYHKDBN"
"*ac?g???t???????";
- //printf("%c %d\n", het[used_base], used_depth);
+ //printf("%c %d\n", het[used_base], tot_depth);
if (qual)
*qual = used_base ? 100.0 * used_score / tscore : 0;
}
}
- if (opts->gap5) {
+ if (opts->mode != MODE_SIMPLE) {
consensus_t cons;
- calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0,
- depth, p, opts, &cons, opts->default_qual);
+ calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
+ depth, p, opts, &cons, opts->default_qual,
+ &cons_prob_recall, &cons_prob_precise);
if (cons.het_logodd > 0 && opts->ambig) {
cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
"MCSYc"
}
// share this with basic_pileup
- if (opts->gap5) {
+ if (opts->mode != MODE_SIMPLE) {
consensus_t cons;
- calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0,
- depth, p, opts, &cons, opts->default_qual);
+ calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
+ depth, p, opts, &cons, opts->default_qual,
+ &cons_prob_recall, &cons_prob_precise);
if (cons.het_logodd > 0 && opts->ambig) {
cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
"MCSYc"
opts->last_tid = tid;
return 0;
}
+ if (opts->mark_ins && nth && cb != '*') {
+ kputc('_', seq);
+ kputc('_', qual);
+ }
+
// end of share
// Append consensus base/qual to seqs
return 0;
}
+
// END OF NEW PILEUP
//---------------------------------------------------------------------------
fprintf(fp, " Exclude reads with any flag bit set\n");
fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n");
fprintf(fp, " --min-MQ INT Exclude reads with mapping quality below INT [0]\n");
+ fprintf(fp, " --min-BQ INT Exclude reads with base quality below INT [0]\n");
fprintf(fp, " --show-del yes/no Whether to show deletion as \"*\" [no]\n");
fprintf(fp, " --show-ins yes/no Whether to show insertions [yes]\n");
+ fprintf(fp, " --mark-ins Add '+' before every inserted base/qual [off]\n");
fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n");
fprintf(fp, "\nFor simple consensus mode:\n");
fprintf(fp, " -q, --(no-)use-qual Use quality values in calculation [off]\n");
fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n");
- fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n");
- fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.5]\n");
+ fprintf(fp, " -d, --min-depth INT Minimum depth of INT [2]\n");
+ fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.15]\n");
fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n");
fprintf(fp, " -C, --cutoff C Consensus cutoff quality C [10]\n");
fprintf(fp, " --(no-)adj-qual Modify quality with local minima [on]\n");
fprintf(fp, " --high-MQ INT Cap maximum mapping quality [60]\n");
fprintf(fp, " --P-het FLOAT Probability of heterozygous site[%.1e]\n",
P_HET);
+ fprintf(fp, " --P-indel FLOAT Probability of indel sites[%.1e]\n",
+ P_INDEL);
+ fprintf(fp, " --het-scale FLOAT Heterozygous SNP probability multiplier[%.1e]\n",
+ P_HET_SCALE);
+ fprintf(fp, " -p, --homopoly-fix Spread low-qual bases to both ends of homopolymers\n");
+ fprintf(fp, " --homopoly-score FLOAT\n"
+ " Qual fraction adjustment for -p option [%g]\n", P_HOMOPOLY);
+ fprintf(fp, " -t, --qual-calibration FILE / :config (see man page)\n");
+ fprintf(fp, " Load quality calibration file\n");
+ fprintf(fp, "\n");
+ fprintf(fp, " -X, --config STR Use pre-defined configuration set. STR from:\n");
+ fprintf(fp, " hiseq, hifi, r10.4_sup, r10.4_dup and ultima\n");
fprintf(fp, "\nGlobal options:\n");
sam_global_opt_help(fp, "-.---@-.");
consensus_opts opts = {
// User options
- .gap5 = 1,
+ .mode = MODE_RECALL,
.use_qual = 0,
.min_qual = 0,
.adj_qual = 1,
.all_bases = 0,
.show_del = 0,
.show_ins = 1,
+ .mark_ins = 0,
.incl_flags = 0,
.excl_flags = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP,
.min_mqual = 0,
.P_het = P_HET,
+ .P_indel = P_INDEL,
+ .het_scale = P_HET_SCALE,
+ .homopoly_fix = 0,
+ .homopoly_redux = 0.01,
// Internal state
.ks_line = {0,0},
.last_pos = -1,
};
+ set_qcal(&opts.qcal, QCAL_FLAT);
+
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'),
{"het-only", no_argument, NULL, 6},
{"show-del", required_argument, NULL, 7},
{"show-ins", required_argument, NULL, 8},
+ {"mark-ins", no_argument, NULL, 18},
{"output", required_argument, NULL, 'o'},
{"incl-flags", required_argument, NULL, 11},
{"rf", required_argument, NULL, 11},
{"excl-flags", required_argument, NULL, 12},
{"ff", required_argument, NULL, 12},
{"min-MQ", required_argument, NULL, 13},
+ {"min-BQ", required_argument, NULL, 16},
{"P-het", required_argument, NULL, 15},
+ {"P-indel", required_argument, NULL, 17},
+ {"het-scale", required_argument, NULL, 19},
{"mode", required_argument, NULL, 'm'},
+ {"homopoly-fix", no_argument, NULL, 'p'},
+ {"homopoly-score", required_argument, NULL, 'p'+100},
+ {"homopoly-redux", required_argument, NULL, 'p'+200},
+ {"qual-calibration", required_argument, NULL, 't'},
+ {"config", required_argument, NULL, 'X'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:",
+ while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:",
lopts, NULL)) >= 0) {
switch (c) {
case 'a': opts.all_bases++; break;
case 'r': opts.reg = optarg; break;
case 'C': opts.cons_cutoff = atoi(optarg); break;
case 'A': opts.ambig = 1; break;
+ case 'p': opts.homopoly_fix = P_HOMOPOLY; break;
+ case 'p'+100: opts.homopoly_fix = atof(optarg); break;
+ case 'p'+200:
+ // EXPERIMENTAL
+ opts.homopoly_redux = atof(optarg); break;
case 1: opts.default_qual = atoi(optarg); break;
case 6: opts.het_only = 1; break;
case 7: opts.show_del = (*optarg == 'y' || *optarg == 'Y'); break;
case 8: opts.show_ins = (*optarg == 'y' || *optarg == 'Y'); break;
+ case 18: opts.mark_ins = 1; break;
case 13: opts.min_mqual = atoi(optarg); break;
+ case 16: opts.min_qual = atoi(optarg); break;
case 15: opts.P_het = atof(optarg); break;
+ case 17: opts.P_indel = atof(optarg); break;
+ case 19: opts.het_scale = atof(optarg); break;
case 'q'+100: opts.adj_qual = 1; break;
case 'q'+101: opts.adj_qual = 0; break;
case 'm'+100: opts.nm_adjust = 1; break;
case 'm': // mode
if (strcasecmp(optarg, "simple") == 0) {
- opts.gap5 = 0;
- } else if (strcasecmp(optarg, "bayesian") == 0) {
- opts.gap5 = 1;
+ opts.mode = MODE_SIMPLE;
+ } else if (strcasecmp(optarg, "bayesian_m") == 0) {
+ // EXPERIMENTAL:
+ // A mixture of modified precise/recall params and a
+ // blending of the two. Sometimes helps a bit.
+ opts.mode = MODE_MIXED;
+ } else if (strcasecmp(optarg, "bayesian_p") == 0) {
+ // EXPERIMENTAL:
+ // favours precision
+ opts.mode = MODE_PRECISE;
+ } else if (strcasecmp(optarg, "bayesian_r") == 0 ||
+ strcasecmp(optarg, "bayesian") == 0) {
+ // favours recall; the default
+ opts.mode = MODE_RECALL;
+ } else if (strcasecmp(optarg, "bayesian_116") == 0) {
+ opts.mode = MODE_BAYES_116;
} else {
fprintf(stderr, "Unknown mode %s\n", optarg);
return 1;
}
break;
+ case 'X':
+ if (strcasecmp(optarg, "hifi") == 0) {
+ set_qcal(&opts.qcal, QCAL_HIFI);
+ opts.mode = MODE_RECALL;
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.low_mqual = 5;
+ opts.scale_mqual = 1.5;
+ opts.het_scale = 0.37;
+ } else if (strcasecmp(optarg, "hiseq") == 0) {
+ opts.mode = MODE_RECALL;
+ set_qcal(&opts.qcal, QCAL_HISEQ);
+ opts.homopoly_redux = 0.01;
+ } else if (strcasecmp(optarg, "r10.4_sup") == 0) {
+ // Same as HiFi params, but ONT calibration table.
+ // At higher depth, hifi params work well for ONT
+ // when combined with ONT calibration chart.
+ //
+ // At lower depth we gain a bit from increasing homopoly_redux
+ set_qcal(&opts.qcal, QCAL_ONT_R10_4_SUP);
+ opts.mode = MODE_RECALL;
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.low_mqual = 5;
+ opts.scale_mqual = 1.5;
+ opts.het_scale = 0.37;
+
+ // Also consider, for lower depth:
+ // opts.homopoly_redux = 1;
+ // opts.scale_mqual = 1;
+ // opts.het_scale = 0.45;
+ } else if (strcasecmp(optarg, "r10.4_dup") == 0) {
+ // Just a copy of HiFi for duplex currently until
+ // we get a good truth set for calibration.
+ set_qcal(&opts.qcal, QCAL_ONT_R10_4_DUP);
+ opts.mode = MODE_RECALL;
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.low_mqual = 5;
+ opts.scale_mqual = 1.5;
+ opts.het_scale = 0.37;
+ } else if (strcasecmp(optarg, "ultima") == 0) {
+ // Very similar to HiFi, but with own calibration table
+ opts.mode = MODE_RECALL;
+ set_qcal(&opts.qcal, QCAL_ULTIMA);
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.het_scale = 0.37;
+ opts.scale_mqual = 2;
+ opts.low_mqual = 10;
+ } else {
+ // NB consider defaults that are a mixture of all above.
+ // Options are all similar for all bar Illumina.
+ // Unsure what :flat calibration table does to each of
+ // these though.
+ fprintf(stderr, "Unrecognised configuration name: \"%s\"\n",
+ optarg);
+ return 1;
+ }
+ break;
+
case 11:
if ((opts.incl_flags = bam_str2flag(optarg)) < 0) {
print_error("consensus", "could not parse --rf %s", optarg);
}
break;
+ case 't': // --qual-calibration
+ if (load_qcal(&opts.qcal, optarg) < 0) {
+ print_error("consensus",
+ "failed to load quality calibration '%s'",
+ optarg);
+ return -1;
+ }
+ break;
+
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?':
}
}
+#if 0
+ // Dump out the qcal table. Useful for copying into the code above.
+ int i;
+ qcal_t *q = &opts.qcal;
+ fprintf(stderr, "{");
+ for (i = 0; i < 100; i++)
+ fprintf(stderr, "%2d,%s", q->smap[i],(i+1)%10?" ":"\n");
+ fprintf(stderr, "},\n{");
+ for (i = 0; i < 100; i++)
+ fprintf(stderr, "%2d,%s", q->umap[i],(i+1)%10?" ":"\n");
+ fprintf(stderr, "},\n{");
+ for (i = 0; i < 100; i++)
+ fprintf(stderr, "%2d,%s", q->omap[i],(i+1)%10?" ":"\n");
+ fprintf(stderr, "}\n");
+#endif
+
+ if (opts.mode != MODE_SIMPLE) {
+ if (opts.mode == MODE_PRECISE)
+ // More accuracy / precision, but a significant drop
+ // in recall.
+ consensus_init(opts.P_het, opts.P_indel,
+ 0.3 * opts.het_scale, opts.homopoly_redux,
+ &opts.qcal, MODE_PRECISE, &cons_prob_precise);
+
+ if (opts.mode == MODE_MIXED)
+ // Blend these in when running in mixed mode, so we can
+ // keep sensitivity but have a better joint quality to
+ // reduce the FP rate.
+ consensus_init(pow(opts.P_het, 0.7), pow(opts.P_indel, 0.7),
+ 0.3 * opts.het_scale, opts.homopoly_redux,
+ &opts.qcal, MODE_PRECISE, &cons_prob_precise);
+
+ // Better recall, at a cost of some accuracy (false positives)
+ consensus_init(opts.P_het, opts.P_indel, opts.het_scale,
+ opts.mode == MODE_RECALL ? opts.homopoly_redux : 0.01,
+ &opts.qcal, MODE_RECALL, &cons_prob_recall);
+ }
+
if (argc != optind+1) {
if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
else usage_exit(stderr, EXIT_FAILURE);
}
if (opts.fmt == PILEUP) {
- if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL,
- basic_pileup, &opts) < 0)
+ if (pileup_loop(opts.fp, opts.h, readaln2,
+ opts.mode != MODE_SIMPLE ? nm_init : NULL,
+ basic_pileup,
+ opts.mode != MODE_SIMPLE ? nm_free : NULL,
+ &opts) < 0)
goto err;
if (opts.all_bases) {
goto err;
}
} else {
- if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL,
+ if (pileup_loop(opts.fp, opts.h, readaln2,
+ opts.mode != MODE_SIMPLE ? nm_init : NULL,
basic_fasta,
+ opts.mode != MODE_SIMPLE ? nm_free : NULL,
&opts) < 0)
goto err;
if (opts.all_bases) {
/* bam_consensus.c -- consensus subcommand.
Copyright (C) 1998-2001,2003 Medical Research Council (Gap4/5 source)
- Copyright (C) 2003-2005,2007-2022 Genome Research Ltd.
+ Copyright (C) 2003-2005,2007-2023 Genome Research Ltd.
Author: James Bonfield <jkb@sanger.ac.uk>
// but 30T+ 20T- 18A+ 2A- seems like a consistent A miscall on one strand
// only, while T is spread evenly across both strands.
+// TODO: Phasing of long reads.
+// Long reads offer very strong phasing opportunities for SNPs.
+// From these, we get strong evidence for accuracy of indels.
+// Specifically whether the distribution of poly-len within a phase
+// is significantly different to the distribution of poly len between
+// phases.
+
+// TODO end STR trimming. Eg:
+// REF AAGCTGAAAAGTTAATGTCTTATTTTTTTTTTTTTTTTGAGATGGAGTC
+// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc
+// aagctgaaaagttaatgtcttattttttttt
+// aagctgaaaagttaatgtctta****ttttttttttttgagatggagtc
+// Middle seq doesn't validate those initial T alignments.
+// Qual_train solves this by use of the STR trimmer.
+
+// TODO add a weight for proximity to homopolymer.
+// Maybe length/distance? So 3 away from a 12-mer is similar to 1 away
+// from a 4-mer?
+
+// TODO: Count number of base types between this point and the nearest
+// indel or end of read. Eg GATCG<here>AGAGAG*TAGC => 2 (A and G).
+// adj is nbase/4 * score, or (nbase+1)/5?
+// Perhaps multiplied by length too, to get local complexity score?
+
#include <config.h>
#include <stdio.h>
#include <ctype.h>
#include <htslib/sam.h>
+#include <htslib/hfile.h>
#include "samtools.h"
#include "sam_opts.h"
# define MAX(a,b) ((a)>(b)?(a):(b))
#endif
+// Defines for experimental code which is currently disabled
+
+// Hardy-Weinberg statistics to check heterozygous sites match allelic
+// frequencies.
+//#define DO_HDW
+
+// Filter bayesian calls by min-depth and min-fract parameters
+//#define DO_FRACT
+
+// Checks uniqueness of surrounding bases to adjust scores
+//#define K2 2
+
+// Look for strand bias in distribution of homopolymer lengths
+//#define DO_POLY_DIST
+
// Minimum cutoff for storing mod data; => at least 10% chance
#define MOD_CUTOFF 0.46
typedef unsigned char uc;
+// Simple recalibration table for substitutions, undercalls and overcalls.
+// In future, we'll update this to be kmer based too.
+typedef struct {
+    int smap[101]; // substitution or SNP; indexed by input quality 0..100
+    int umap[101]; // undercall or DEL; indexed by input quality 0..100
+    int omap[101]; // overcall or INS; indexed by input quality 0..100
+} qcal_t;
+
typedef struct {
// User options
char *reg;
int min_depth;
double call_fract;
double het_fract;
- int gap5;
+ int mode; // One of MODE_* macros below
enum format fmt;
int cons_cutoff;
int ambig;
int all_bases;
int show_del;
int show_ins;
+ int mark_ins;
int excl_flags;
int incl_flags;
int min_mqual;
double P_het;
+ double P_indel;
+ double het_scale;
+ double homopoly_fix;
+ double homopoly_redux;
+ qcal_t qcal;
// Internal state
samFile *fp;
float discrep;
} consensus_t;
-#define P_HET 1e-4
+#define P_HET 1e-3
+#define P_INDEL 2e-4
+#define P_HOMOPOLY 0.5
+#define P_HET_SCALE 1.0
#define LOG10 2.30258509299404568401
#define TENOVERLOG10 4.34294481903251827652
#define ALIGNED(x)
#endif
-static double prior[25] ALIGNED(16); /* Sum to 1.0 */
-static double lprior15[15] ALIGNED(16); /* 15 combinations of {ACGT*} */
-
-/* Precomputed matrices for the consensus algorithm */
-static double pMM[101] ALIGNED(16);
-static double p__[101] ALIGNED(16);
-static double p_M[101] ALIGNED(16);
-
+// Initialised once as a global array. This won't work if threaded,
+// but we'll rewrite if and when that gets added later.
static double e_tab_a[1002] ALIGNED(16);
static double *e_tab = &e_tab_a[500];
static double e_tab2_a[1002] ALIGNED(16);
static double *e_tab2 = &e_tab2_a[500];
static double e_log[501] ALIGNED(16);
+/* Precomputed matrices for the consensus algorithm.
+ * All members are filled in by consensus_init().  The [101] tables are
+ * natural-log likelihoods indexed by quality value; entry 0 is a copy of
+ * entry 1. */
+typedef struct {
+    double prior[25]    ALIGNED(16);  /* ACGT* by ACGT* relative weights; homozygous (diagonal) entries are 1 */
+    double lprior15[15] ALIGNED(16);  /* log(prior) for the 15 combinations of {ACGT*} */
+
+    double pMM[101] ALIGNED(16); // match: log(prob), prob from qcal smap
+    double p__[101] ALIGNED(16); // mismatch: log((1-prob)/3)
+    double p_M[101] ALIGNED(16); // half match: log-mean of MM and __, plus log(het_scale)
+    double po_[101] ALIGNED(16); // overcall vs mismatch: log-mean of oo and __
+    double poM[101] ALIGNED(16); // overcall vs match: log-mean of oo and MM, capped at p_M+.5
+    double poo[101] ALIGNED(16); // overcall: log((1-prob)/3), prob from qcal omap
+    double puu[101] ALIGNED(16); // undercall: log((1-prob)/3), prob from qcal umap
+    double pum[101] ALIGNED(16); // undercall half match: log-mean of uu and mm
+    double pmm[101] ALIGNED(16); // undercall match: log(prob), prob from qcal umap
+
+    // Multiplier on homopolymer length before reducing phred qual
+    double poly_mul;
+} cons_probs;
+
+// Two sets of params; recall oriented (gap5) and precision (stf).
+// We use the former unless MODE_MIXED is set (which is the default
+// for bayesian consensus mode if P_indel is significant).
+static cons_probs cons_prob_recall, cons_prob_precise;
+
/*
* Lots of confusing matrix terms here, so some definitions will help.
*
* The heterozygosity weight though is a per column calculation as we're
* trying to model whether the column is pure or mixed. Hence this is done
  * once via a prior and has no effect on the individual matrix cells.
+ *
+ * We have a generic indel probability, but it's a catch all for overcall,
+ * undercall, alignment artifacts, homopolymer issues, etc. So we can set
+ * it considerably higher and just let the QUAL skew do the filtering for
+ * us, albeit no longer well calibrated.
*/
-static void consensus_init(double p_het) {
+// NB: Should _M be MM?
+// Ie sample really is A/C het, and we observe C. That should be a match,
+// not half a match.
+
+#define MODE_SIMPLE 0 // freq counting
+
+#define MODE_BAYES_116 1 // Samtools 1.16 (no indel param)
+#define MODE_RECALL 2 // so called as it's the params from Gap5
+#define MODE_PRECISE 3 // a more precise set; +FN, --FP
+#define MODE_MIXED 4 // Combination of GAP5/BAYES
+
+#define QCAL_FLAT 0
+#define QCAL_HIFI 1
+#define QCAL_HISEQ 2
+#define QCAL_ONT_R10_4_SUP 3
+#define QCAL_ONT_R10_4_DUP 4
+#define QCAL_ULTIMA 5
+
+// Calibration tables here don't necessarily reflect the true accuracy.
+// They have been manually tuned to work in conjunction with other command
+// line parameters used in the machine profiles. For example reducing one
+// qual here and increasing sensitivity elsewhere via another parameter.
+static qcal_t static_qcal[6] = {
+ { // FLAT
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99},
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}
+ },
+
+ { // HiFi
+ {10, 11, 11, 12, 13, 14, 15, 16, 18, 19,
+ 20, 21, 22, 23, 24, 25, 27, 28, 29, 30,
+ 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+ 38, 39, 39, 40, 40, 41, 41, 41, 41, 42,
+ 42, 42, 42, 43, 43, 43, 43, 43, 43, 43,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ },
+ { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9,
+ 10, 11, 11, 12, 13, 14, 15, 15, 16, 17,
+ 18, 19, 19, 20, 20, 21, 22, 23, 23, 24,
+ 25, 25, 25, 26, 26, 26, 27, 27, 28, 28,
+ 28, 28, 27, 27, 27, 28, 28, 28, 28, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 26, 26, 25, 26, 26, 27, 27, 27,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27,
+ 28, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 28, 28, 30, 30, 30, 30, 30, 30, 30,
+ },
+ { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14,
+ 15, 15, 16, 17, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 23, 24, 24, 24, 25,
+ 25, 25, 25, 25, 25, 26, 26, 26, 26, 27,
+ 27, 27, 27, 27, 27, 28, 28, 28, 28, 28,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ }
+ },
+
+ { // HiSeq
+ { 2, 2, 2, 3, 3, 4, 5, 5, 6, 7,
+ 8, 9, 10, 11, 11, 12, 13, 14, 15, 16,
+ 17, 17, 18, 19, 20, 21, 22, 22, 23, 24,
+ 25, 26, 27, 28, 28, 29, 30, 31, 32, 33,
+ 34, 34, 35, 36, 37, 38, 39, 39, 40, 41,
+ 42, 43, 44, 45, 45, 46, 47, 48, 49, 50,
+ 51, 51, 52, 53, 54, 55, 56, 56, 57, 58,
+ 59, 60, 61, 62, 62, 63, 64, 65, 66, 67,
+ 68, 68, 69, 70, 71, 72, 73, 73, 74, 75,
+ 76, 77, 78, 79, 79, 80, 81, 82, 83, 84,
+ },
+ { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11,
+ 13, 14, 15, 16, 17, 19, 20, 21, 22, 23,
+ 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
+ 37, 38, 39, 40, 41, 43, 44, 45, 46, 47,
+ 49, 50, 51, 52, 53, 55, 56, 57, 58, 59,
+ 61, 62, 63, 64, 65, 67, 68, 69, 70, 71,
+ 73, 74, 75, 76, 77, 79, 80, 81, 82, 83,
+ 85, 86, 87, 88, 89, 91, 92, 93, 94, 95,
+ 97, 98, 99, 100, 101, 103, 104, 105, 106, 107,
+ 109, 110, 111, 112, 113, 115, 116, 117, 118, 119,
+ },
+ { 1, 2, 3, 4, 5, 7, 8, 9, 10, 11,
+ 13, 14, 15, 16, 17, 19, 20, 21, 22, 23,
+ 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
+ 37, 38, 39, 40, 41, 43, 44, 45, 46, 47,
+ 49, 50, 51, 52, 53, 55, 56, 57, 58, 59,
+ 61, 62, 63, 64, 65, 67, 68, 69, 70, 71,
+ 73, 74, 75, 76, 77, 79, 80, 81, 82, 83,
+ 85, 86, 87, 88, 89, 91, 92, 93, 94, 95,
+ 97, 98, 99, 100, 101, 103, 104, 105, 106, 107,
+ 109, 110, 111, 112, 113, 115, 116, 117, 118, 119,
+ }
+ },
+ { // ONT R10.4 super
+ { 0, 2, 2, 2, 3, 4, 4, 5, 6, 7,
+ 7, 8, 9, 12, 13, 14, 15, 15, 16, 17,
+ 18, 19, 20, 22, 24, 25, 26, 27, 28, 29,
+ 30, 31, 33, 34, 36, 37, 38, 38, 39, 39,
+ 40, 40, 40, 40, 40, 40, 40, 41, 40, 40,
+ 41, 41, 40, 40, 40, 40, 41, 40, 40, 40,
+ 40, 41, 41, 40, 40, 41, 40, 40, 39, 41,
+ 40, 41, 40, 40, 41, 41, 41, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+ 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+ },
+ { 0, 2, 2, 2, 3, 4, 5, 6, 7, 8,
+ 8, 9, 9, 10, 10, 10, 11, 12, 12, 13,
+ 13, 13, 14, 14, 15, 16, 16, 17, 18, 18,
+ 19, 19, 20, 21, 22, 23, 24, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ },
+ { 0, 4, 6, 6, 6, 7, 7, 8, 9, 9,
+ 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 15, 15, 15, 16, 16, 17, 17, 18, 18, 19,
+ 19, 20, 20, 21, 22, 22, 23, 23, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ }
+ },
+ { // ONT R10.4 duplex; just a copy of hifi for now
+ {10, 11, 11, 12, 13, 14, 15, 16, 18, 19,
+ 20, 21, 22, 23, 24, 25, 27, 28, 29, 30,
+ 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+ 38, 39, 39, 40, 40, 41, 41, 41, 41, 42,
+ 42, 42, 42, 43, 43, 43, 43, 43, 43, 43,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 44, 44, 44, 44,
+ },
+ { 4, 4, 4, 4, 5, 6, 6, 7, 8, 9,
+ 10, 11, 11, 12, 13, 14, 15, 15, 16, 17,
+ 18, 19, 19, 20, 20, 21, 22, 23, 23, 24,
+ 25, 25, 25, 26, 26, 26, 27, 27, 28, 28,
+ 28, 28, 27, 27, 27, 28, 28, 28, 28, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 26, 26, 25, 26, 26, 27, 27, 27,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27,
+ 28, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 28, 28, 30, 30, 30, 30, 30, 30, 30,
+ },
+ { 8, 8, 8, 8, 9, 10, 11, 12, 13, 14,
+ 15, 15, 16, 17, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 23, 24, 24, 24, 25,
+ 25, 25, 25, 25, 25, 26, 26, 26, 26, 27,
+ 27, 27, 27, 27, 27, 28, 28, 28, 28, 28,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
+ }
+ },
+ { // Ultima Genomics
+ { 2, 2, 3, 4, 5, 6, 6, 7, 8, 9,
+ 10, 10, 11, 12, 13, 14, 14, 15, 16, 17,
+ 18, 18, 19, 21, 22, 23, 23, 24, 25, 26,
+ 27, 27, 28, 29, 30, 31, 31, 32, 33, 34,
+ 35, 35, 36, 37, 38, 39, 39, 40, 42, 43,
+ 44, 44, 45, 46, 47, 48, 48, 49, 50, 51,
+ 52, 52, 53, 54, 55, 56, 56, 57, 58, 59,
+ 60, 60, 61, 63, 64, 65, 65, 66, 67, 68,
+ 69, 69, 70, 71, 72, 73, 73, 74, 75, 76,
+ 77, 77, 78, 79, 80, 81, 81, 82, 84, 85,
+ },
+ { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4,
+ 5, 5, 6, 6, 7, 7, 8, 8, 9, 10,
+ 10, 10, 11, 12, 13, 13, 13, 14, 15, 16,
+ 16, 16, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 22, 22, 23, 23, 24, 24, 25,
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ },
+ { 1, 1, 2, 2, 3, 3, 4, 4, 4, 4,
+ 5, 5, 6, 6, 7, 7, 8, 8, 9, 10,
+ 10, 10, 11, 12, 13, 13, 13, 14, 15, 16,
+ 16, 16, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 22, 22, 23, 23, 24, 24, 25,
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ }
+ }
+};
+
+/*
+ * Copies one of the built-in calibration profiles (QCAL_* index into
+ * static_qcal) into *q.
+ *
+ * Returns 0 on success,
+ *        -1 if id does not name a built-in profile (*q is left untouched).
+ *
+ * NOTE(review): each built-in map lists 100 values for a 101 element
+ * array, so element [100] of the copied tables is zero-initialised.
+ * Confirm whether quality 100 can be looked up by the callers.
+ */
+int set_qcal(qcal_t *q, int id) {
+    const int n_profiles = sizeof(static_qcal) / sizeof(static_qcal[0]);
+
+    if (id >= 0 && id < n_profiles) {
+        *q = static_qcal[id];
+        return 0;
+    }
+
+    return -1;
+}
+
+/*
+ * Loads a quality calibration table into *q.
+ *
+ * fn is either a built-in profile name (":hifi", ":hiseq", ":r10.4_sup",
+ * ":r10.4_dup", ":ultima" or ":flat") or the name of a file opened with
+ * hopen().  In a file, lines starting with '#' are skipped and every
+ * other line must match
+ *     QUAL <qual> <subst> <undercall> <overcall>
+ * with <qual> in ascending order.  Qualities not listed inherit the
+ * values of the preceding table entry, and the last listed quality is
+ * extended to the end of the table.  Lines whose <qual> lies outside the
+ * table (0..100) do not store a value.
+ *
+ * Returns 0 on success,
+ *        -1 on open failure, a malformed line or out-of-order qualities,
+ *        -2 if closing the file failed.
+ */
+int load_qcal(qcal_t *q, const char *fn) {
     int i;
+    // Highest valid index of the smap / umap / omap tables.
+    const int qmax = (int)(sizeof(q->smap) / sizeof(*q->smap)) - 1;
+
+    if (strcmp(fn, ":hifi") == 0)
+        return set_qcal(q, QCAL_HIFI);
+    if (strcmp(fn, ":hiseq") == 0)
+        return set_qcal(q, QCAL_HISEQ);
+    if (strcmp(fn, ":r10.4_sup") == 0)
+        return set_qcal(q, QCAL_ONT_R10_4_SUP);
+    if (strcmp(fn, ":r10.4_dup") == 0)
+        return set_qcal(q, QCAL_ONT_R10_4_DUP);
+    if (strcmp(fn, ":ultima") == 0)
+        return set_qcal(q, QCAL_ULTIMA);
+
+    // default
+    for (i = 0; i <= qmax; i++)
+        q->smap[i] = q->umap[i] = q->omap[i] = i;
+
+    if (strcmp(fn, ":flat") == 0)
+        return 0;
+
+    hFILE *fp = hopen(fn, "r");
+    if (!fp)
+        return -1;
+
+    kstring_t line = KS_INITIALIZE;
+    int max = 0;
+    int last_qual = 0;
+    while (line.l = 0, kgetline(&line, (kgets_func *)hgets, fp) >= 0) {
+        int v, s, u, o;
+        if (*line.s == '#')
+            continue;
+        if (sscanf(line.s, "QUAL %d %d %d %d", &v, &s, &u, &o) != 4)
+            goto err;
+
+        // Carry the previous entry forward over any gap up to v, but
+        // never beyond the end of the tables: v comes straight from the
+        // file and may be arbitrarily large.
+        while (last_qual < v && last_qual < qmax) {
+            q->smap[last_qual+1] = q->smap[last_qual];
+            q->umap[last_qual+1] = q->umap[last_qual];
+            q->omap[last_qual+1] = q->omap[last_qual];
+            last_qual++;
+        }
+
+        // Index qmax is a valid slot (consensus_init reads it), so an
+        // explicit entry for it is honoured too.
+        if (v >= 0 && v <= qmax) {
+            q->smap[v] = s;
+            q->umap[v] = u;
+            q->omap[v] = o;
+        }
+
+        if (v < max) {
+            fprintf(samtools_stderr, "Qual calibration file is not in ascending order\n");
+            goto err; // shared exit also releases the line buffer
+        }
+        max = v;
+    }
+
+    // Extend the last listed quality to the end of the tables.
+    if (max > qmax)
+        max = qmax;
+    for (i = max+1; i <= qmax; i++) {
+        q->smap[i] = q->smap[max];
+        q->umap[i] = q->umap[max];
+        q->omap[i] = q->omap[max];
+    }
+
+    ks_free(&line);
+    return hclose(fp) < 0 ? -2 : 0;
+
+ err:
+    ks_free(&line);
+    return hclose(fp) < 0 ? -2 : -1;
+}
+
+static void consensus_init(double p_het, double p_indel, double het_scale,
+ double poly_mul,
+ qcal_t *qcal, int mode, cons_probs *cp) {
+ int i;
+
+ // NB: only need to initialise once, but we do here for now
for (i = -500; i <= 500; i++)
e_tab[i] = exp(i);
for (i = -500; i <= 500; i++)
for (i = 0; i <= 500; i++)
e_log[i] = log(i);
- // Heterozygous locations
+ // EXPERIMENTAL
+ cp->poly_mul = poly_mul;
+
+ // The priors make very little difference, unless shallow data.
+ // ACGT* by ACGT*
+ // So AA=0, CC=6, GG=12, TT=18, **=24
for (i = 0; i < 25; i++)
- prior[i] = p_het / 20;
- prior[0] = prior[6] = prior[12] = prior[18] = prior[24] = (1-p_het)/5;
-
- lprior15[0] = log(prior[0]);
- lprior15[1] = log(prior[1]*2);
- lprior15[2] = log(prior[2]*2);
- lprior15[3] = log(prior[3]*2);
- lprior15[4] = log(prior[4]*2);
- lprior15[5] = log(prior[6]);
- lprior15[6] = log(prior[7]*2);
- lprior15[7] = log(prior[8]*2);
- lprior15[8] = log(prior[9]*2);
- lprior15[9] = log(prior[12]);
- lprior15[10] = log(prior[13]*2);
- lprior15[11] = log(prior[14]*2);
- lprior15[12] = log(prior[18]);
- lprior15[13] = log(prior[19]*2);
- lprior15[14] = log(prior[24]);
-
-
- // Rewrite as new form
+ cp->prior[i] = p_het / 6; // AC AG AT CG CT GT
+
+ // Flat assumption that it is what we observe, and measure everything else
+ // as relative to this.
+ cp->prior[0]=cp->prior[6]=cp->prior[12]=cp->prior[18]=cp->prior[24] = 1;
+
+ // heterozygous deletion
+ for (i = 4; i < 24; i+=5)
+ cp->prior[i] = p_indel / 6; // /6 to be scaled vs p_het equivalently
+
+ // heterozygous insertion
+ for (i = 20; i < 24; i++)
+ cp->prior[i] = p_indel / 6;
+
+ cp->lprior15[0] = log(cp->prior[0]);
+ cp->lprior15[1] = log(cp->prior[1]);
+ cp->lprior15[2] = log(cp->prior[2]);
+ cp->lprior15[3] = log(cp->prior[3]);
+ cp->lprior15[4] = log(cp->prior[4]);
+ cp->lprior15[5] = log(cp->prior[6]);
+ cp->lprior15[6] = log(cp->prior[7]);
+ cp->lprior15[7] = log(cp->prior[8]);
+ cp->lprior15[8] = log(cp->prior[9]);
+ cp->lprior15[9] = log(cp->prior[12]);
+ cp->lprior15[10] = log(cp->prior[13]);
+ cp->lprior15[11] = log(cp->prior[14]);
+ cp->lprior15[12] = log(cp->prior[18]);
+ cp->lprior15[13] = log(cp->prior[19]);
+ cp->lprior15[14] = log(cp->prior[24]);
+
for (i = 1; i < 101; i++) {
- double prob = 1 - pow(10, -i / 10.0);
-
- // May want to multiply all these by 5 so pMM[i] becomes close
- // to -0 for most data. This makes the sums increment very slowly,
- // keeping bit precision in the accumulator.
- pMM[i] = log(prob/5);
- p__[i] = log((1-prob)/20);
- p_M[i] = log((exp(pMM[i]) + exp(p__[i]))/2);
+ double prob = 1 - pow(10, -qcal->smap[i] / 10.0);
+
+ // Or is it that prob is 1-p(subst)-p(overcall)?
+ cp->pMM[i] = log(prob);
+
+ //cp->p__[i] = log(1-prob); // Big help to PB-CCS SNPs; unless fudged
+ cp->p__[i] = log((1-prob)/3); // correct? poor on PB-CCS w/o fudge
+
+ // Mixed alleles; just average two likelihoods
+ cp->p_M[i] = log((exp(cp->pMM[i]) + exp(cp->p__[i]))/2);
+
+ // What does this really mean? Can we simulate this by priors?
+ // It reduces the likelihood of calling het sites, which is
+ // maybe compensation for alignment artifacts? I'm unsure,
+ // but it works (to differing degrees) on both PacBio HiFi and
+ // Illumina HiSeq. It (obviously) loses true hets, but
+ // potentially this can be compensated for by tweaking P-het
+ // (which is entirely in the priors).
+ //
+ // Low het_scale reduces false positives by making hets less
+ // likely to be called. In high depth data we normally have
+ // enough evidence to call correctly even with low het_scale,
+ // so it's a good +FN vs --FP tradeoff. However on low depth
+ // data, het_scale can filter out too many true variants.
+ //
+ // TODO: So consider adjusting at the end maybe?
+ // Also consider never changing calls, but changing their
+ // confidence, so the data is what produces the call with the
+ // parameters skewing the quality score distribution.
+ cp->p_M[i] += log(het_scale);
+
+ if (mode == MODE_BAYES_116) {
+ // Compatibility with samtools 1.16
+
+            // This had no differentiation for indel vs substitution error rates,
+ // so o(vercall) and u(undercall) are subst(_).
+ cp->pmm[i] = cp->pMM[i];
+ cp->poM[i] = cp->p_M[i];
+ cp->pum[i] = cp->p_M[i];
+ cp->po_[i] = cp->p__[i];
+ cp->poo[i] = cp->p__[i];
+ cp->puu[i] = cp->p__[i];
+
+ } else {
+ // When observing A C G T; leads to insertion calls
+ prob = 1 - pow(10, -qcal->omap[i] / 10.0);
+ // /3 for consistency with ACGT rem as relative likelihoods.
+ // Otherwise with flat priors we end up calling all shallow data
+ // as "*", which is illogical.
+ cp->poo[i] = log((1-prob)/3);
+
+ // Ensure pMM is always more likely. (NB: This shouldn't happen
+ // now with the addition of the /3 step above.)
+ if (cp->poo[i] > cp->pMM[i]-.5)
+ cp->poo[i] = cp->pMM[i]-.5;
+
+ cp->po_[i] = log((exp(cp->poo[i]) + exp(cp->p__[i]))/2);
+ cp->poM[i] = log((exp(cp->poo[i]) + exp(cp->pMM[i]))/2);
+
+            // Overcalls should never be twice as likely as mismatches.
+ // Het bases are mix of _M (other) and MM ops (this).
+ // It's fine for _M to be less likely than oM (more likely
+ // to be overcalled than miscalled), but it should never
+ // be stronger when combined with other mixed data.
+ if (cp->poM[i] > cp->p_M[i]+.5)
+ cp->poM[i] = cp->p_M[i]+.5;
+
+ // Note --low-MQ and --scale-MQ have a big impact on
+ // undercall errs. May need to separate these options per
+ // type, but how?
+ // Multiple-calls, as with mixed mode? This feels like a cheat
+
+ prob = 1 - pow(10, -qcal->umap[i] / 10.0);
+ cp->pmm[i] = log(prob);
+ cp->puu[i] = log((1-prob)/3);
+ if (cp->puu[i] > cp->pMM[i]-.5) // MM is -ve
+ cp->puu[i] = cp->pMM[i]-.5;
+
+ cp->pum[i] = log((exp(cp->puu[i]) + exp(cp->pmm[i]))/2);
+ }
}
- pMM[0] = pMM[1];
- p__[0] = p__[1];
- p_M[0] = p_M[1];
+ cp->pMM[0] = cp->pMM[1];
+ cp->p__[0] = cp->p__[1];
+ cp->p_M[0] = cp->p_M[1];
+
+ cp->pmm[0] = cp->pmm[1];
+ cp->poo[0] = cp->poo[1];
+ cp->po_[0] = cp->po_[1];
+ cp->poM[0] = cp->poM[1];
+ cp->puu[0] = cp->puu[1];
+ cp->pum[0] = cp->pum[1];
}
static inline double fast_exp(double y) {
return indel;
}
+/*
+ * Evens out base qualities inside homopolymer runs of a read.
+ *
+ * Some machines, including 454 and PacBio, always place the low quality
+ * of a homopolymer on its first or last base.  That causes trouble once
+ * reads are reverse-complemented and aligned, especially with
+ * left-justified indels.  Other platforms keep the middle of the run high
+ * and spread the low confidence evenly over both ends, which introduces
+ * no strand bias.
+ *
+ * Here each run is rewritten in that second style so naive consensus or
+ * variant calling algorithms behave: working inwards from both ends, each
+ * pair of qualities is replaced by the quality of their mean error
+ * probability.  The qualities of b are modified in place.
+ */
+void homopoly_qual_fix(bam1_t *b) {
+    static double err_of_phred[256] = {0};
+    int i;
+
+    // One-off table fill.  Entry 0 becomes pow(10, 0) == 1, so a zero
+    // there means the table has not been computed yet.
+    if (err_of_phred[0] == 0) {
+        for (i = 0; i < 256; i++)
+            err_of_phred[i] = pow(10, i/-10.0);
+    }
+
+    uint8_t *bases = bam_get_seq(b);
+    uint8_t *quals = bam_get_qual(b);
+    const int len = b->core.l_qseq;
+
+    i = 0;
+    while (i < len) {
+        // Find the run first..last (inclusive) sharing the base at i.
+        const int first = i;
+        const int nt = bam_seqi(bases, first);
+        int last = first;
+        while (last+1 < len && bam_seqi(bases, last+1) == nt)
+            last++;
+        i = last + 1;
+
+        if (first == last)
+            continue; // a single base is not a homopolymer
+
+        // Of the options (reverse if the end is worse than the start,
+        // average the outer pairs, or fully redistribute), averaging
+        // outer pairs is the middle route taken here.
+        int lo = first, hi = last;
+        while (lo < hi) {
+            double e = err_of_phred[quals[lo]] + err_of_phred[quals[hi]];
+            quals[lo] = quals[hi] = -fast_log2(e/2)*3.0104+.49;
+            lo++;
+            hi--;
+        }
+    }
+}
+
// Return the local NM figure within halo (+/- HALO) of pos.
// This local NM is used as a way to modify MAPQ to get a localised MAPQ
// score via an adhoc fashion.
return 0;
pos -= b->core.pos;
if (pos < 0)
- return nm[0];
+ return nm[0] & ((1<<24)-1);
if (pos >= b->core.l_qseq)
- return nm[b->core.l_qseq-1];
+ return nm[b->core.l_qseq-1] & ((1<<24)-1);
+
+ return (nm[pos] & ((1<<24)-1)) / 10.0;
+}
- return nm[pos] / 10.0;
+// Return the homopolymer length figure stored in the top bits
+// (value >> 24) of the per-read array in p->cd for reference position
+// pos.  Returns 0 when there is no array or pos lies outside the read's
+// 0..l_qseq-1 range relative to b->core.pos.
+int poly_len(const pileup_t *p, const bam1_t *b, hts_pos_t pos) {
+    int *packed = (int *)p->cd;
+    if (packed == NULL)
+        return 0;
+
+    pos -= b->core.pos;
+    if (pos < 0 || pos >= b->core.l_qseq)
+        return 0;
+
+    return packed[pos] >> 24;
 }
/*
const bam1_t *b = &p->b;
int qlen = b->core.l_qseq, i;
+ if (qlen <= 0)
+ return 0;
int *local_nm = calloc(qlen, sizeof(*local_nm));
if (!local_nm)
return -1;
p->cd = local_nm;
+ double poly_adj = opts->homopoly_fix ? opts->homopoly_fix : 1;
+
if (opts->adj_qual) {
-#if 0
- // Tweak by localised quality.
- // Quality is reduced by a significant portion of the minimum quality
- // in neighbouring bases, on the pretext that if the region is bad, then
- // this base is bad even if it claims otherwise.
+ // Set local_nm based on a function of current qual and the local
+ // minimum qual within the surrounding window.
+ //
+ // Basically if we're in a region of low confidence then we downgrade
+ // higher qual outliers as they may not be as trustworthy as they
+ // claim. This may be because the qualities have been assigned to
+ // the wrong or arbitrary base (very common in homopolymers), or the
+        // surrounding quality (hence also error likelihood) have led to
+ // misalignments and the base may be contributing to the wrong
+ // pileup column.
+ //
+ // The nm_local() function returns these scores and uses it to bias
+ // the mapping quality, which in turn adjusts base quality.
uint8_t *qual = bam_get_qual(b);
- const int qhalo = 8; // 2?
- int qmin = 50; // effectively caps PacBio qual too
+ uint8_t *seq = bam_get_seq(b);
+ const int qhalo = 8; // window size for base qual
+ int qmin = qual[0]; // min qual within qhalo
+ const int qhalop = 2;// window size for homopolymer qual
+ int qminp = qual[0]; // min qual within homopolymer halo
+ int base = bam_seqi(seq, 0), polyl = 0, polyr = 0; // pos, not len
+
+ // Minimum quality of the initial homopolymer
+ for (i = 1; i < qlen; i++) {
+ if (bam_seqi(seq, i) != base)
+ break;
+ if (i < qhalop && qminp > qual[i])
+ qminp = qual[i];
+ }
+
+ // Minimum quality for general bases
for (i = 0; i < qlen && i < qhalo; i++) {
- local_nm[i] = qual[i];
if (qmin > qual[i])
qmin = qual[i];
}
+
for (;i < qlen-qhalo; i++) {
- //int t = (qual[i]*1 + 3*qmin)/4; // good on 60x
- int t = (qual[i] + 5*qmin)/4; // good on 15x
- local_nm[i] = t < qual[i] ? t : qual[i];
- if (qmin > qual[i+qhalo])
- qmin = qual[i+qhalo];
- else if (qmin <= qual[i-qhalo]) {
+ if (opts->homopoly_fix && bam_seqi(seq, i) != base) {
+ polyl = i;
+ base = bam_seqi(seq, i);
+ qminp = qual[i];
int j;
- qmin = 50;
- for (j = i-qhalo+1; j <= i+qhalo; j++)
- if (qmin > qual[j])
- qmin = qual[j];
+ for (j = i+1; j < qlen; j++) {
+ if (bam_seqi(seq, j) != base)
+ break;
+ if (i < qhalop && qminp > qual[j])
+ qminp = qual[j];
+ }
+ polyr = j-1;
+ } else {
+ // CHECK: do we want to have opts->homopoly_fix above,
+ // so when not in use we don't define pl to non-zero?
+ // Test on SynDip
+ polyr = polyl;
}
- }
- for (; i < qlen; i++) {
- local_nm[i] = qual[i];
- local_nm[i] = (local_nm[i] + 6*qmin)/4;
- }
+ int pl = polyr-polyl;
- for (i = 0; i < qlen; i++) {
- qual[i] = local_nm[i];
+ // Useful for SNPS, as we're judging the variation in
+ // length as an indicator for base-misalignment.
+ // Not so useful for indel calling where the longer the indel
+ // the less confident we are on the len variation being real.
+ int t = (opts->mode == MODE_BAYES_116)
+ ? (qual[i] + 5*qmin)/4
+ : qual[i]/3 + (qminp-pl*2)*poly_adj;
- // Plus overall rescale.
- // Lower becomes lower, very high becomes a little higher.
- // Helps deep GIAB, but detrimental elsewhere. (What this really
- // indicates is quality calibration differs per data set.)
- // It's probably something best accounted for somewhere else.
- //qual[i] = qual[i]*qual[i]/40+1;
- }
- memset(local_nm, 0, qlen * sizeof(*local_nm));
-#else
- // Skew local NM by qual vs min-qual delta
- uint8_t *qual = bam_get_qual(b);
- const int qhalo = 8; // 4
- int qmin = 99;
- for (i = 0; i < qlen && i < qhalo; i++) {
- if (qmin > qual[i])
- qmin = qual[i];
- }
- for (;i < qlen-qhalo; i++) {
- int t = (qual[i] + 5*qmin)/4; // good on 15x
- local_nm[i] += t < qual[i] ? (qual[i]-t) : 0;
+ local_nm[i] += t < qual[i] ? qual[i]-t : 0;
+
+ // Brute force qminp in polyl to polyr range.
+ // TODO: optimise this with sliding window
+ qminp = qual[i];
+ int k;
+ for (k = MAX(polyl,i-qhalop); k <= MIN(polyr,i+qhalop); k++)
+ if (qminp > qual[k])
+ qminp = qual[k];
+
if (qmin > qual[i+qhalo])
qmin = qual[i+qhalo];
else if (qmin <= qual[i-qhalo]) {
}
}
for (; i < qlen; i++) {
- int t = (qual[i] + 5*qmin)/4; // good on 15x
- local_nm[i] += t < qual[i] ? (qual[i]-t) : 0;
+ int t = (opts->mode == MODE_BAYES_116)
+ ? (qual[i] + 5*qmin)/4
+ : qual[i]/3 + qminp*poly_adj;
+ local_nm[i] += t < qual[i] ? qual[i]-t : 0;
}
-#endif
+ }
+
+ // Fix e.g. PacBio homopolymer qualities
+ if (opts->homopoly_fix)
+ homopoly_qual_fix((bam1_t *)b);
+
+ // local_nm[i] & ((1<<24)-1) is for SNP score adjustment.
+ // We also put some more basic poly-X len in local_nm[i] >> 24.
+ uint8_t *seq = bam_get_seq(b);
+ for (i = 0; i < qlen; i++) {
+ int base = bam_seqi(seq, i);
+ int poly = 0, j, k;
+ for (j = i+1; j < qlen; j++)
+ if (bam_seqi(seq, j) != base)
+ break;
+ //printf("%d x %d\n", base, j-i);
+
+ poly = j-i-1; if (poly > 100) poly = 100;
+ const int HALO=0;
+ for (k = i-HALO; k < j+HALO; k++)
+ if (k >= 0 && k < qlen)
+ local_nm[k] = ((MAX(poly, local_nm[k]>>24))<<24)
+ | (local_nm[k] & ((1<<24)-1));
+
+ i = j-1;
}
// Adjust local_nm array by the number of edits within
}
// substitution
- for (i = pos-halo*2 >= 0 ? pos-halo*2 : 0; i < pos-halo; i++)
+ for (i = pos-halo*2 >= 0 ?pos-halo*2 :0; i < pos-halo && i < qlen; i++)
local_nm[i]+=5;
for (; i < pos+halo && i < qlen; i++)
local_nm[i]+=10;
return 1;
}
+// Pileup callback passed to pileup_loop alongside nm_init: releases the
+// per-read array held in p->cd and clears the pointer.  The other
+// arguments are unused.
+void nm_free(void *client_data, samFile *fp, sam_hdr_t *h, pileup_t *p) {
+    void *per_read = p->cd;
+
+    p->cd = NULL;
+    free(per_read);
+}
+
+#ifdef DO_HDW
+/*
+ * Approximates ln(n!) with Stirling's formula plus a 1/(12n) correction
+ * term.  The original author notes this holds remarkably well for both
+ * low and high n.
+ */
+double lnfact(double n) {
+    /* Alternatives left unused by the author:
+     *   Gosper's formula, n*ln(n) - n + ln(2*M_PI*n + M_PI/3) / 2
+     *   a further term,   + log(1 + 1/(288.0*n*n))
+     */
+    double stirling   = (n+0.5)*log(n) - n + log(2*M_PI)/2;
+    double correction = log(1 + 1/(12.0*n));
+
+    return stirling + correction;
+}
+
+/*
+ * The binomial coefficient (n,k) for n trials with k successes.
+ *
+ * With prob(success) = p the full binomial probability is
+ *     P_p(k|n) = n! / (k! (n-k)!) * p^k * (1-p)^(n-k)
+ *
+ * Only the n! / (k! (n-k)!) factor is returned here.  It is assembled
+ * from ln(n!) terms and passed through exp() at the end to avoid
+ * excessively large intermediate numbers.
+ */
+double bincoef(int n, double k) {
+    double ln_coef = lnfact(n) - lnfact(k) - lnfact(n-k);
+
+    return exp(ln_coef);
+}
+
+/*
+ * Binomial probability of k successes in n trials for the special case
+ * p == 0.5, where p^k * (1-p)^(n-k) collapses to 0.5^n.
+ */
+double binprobhalf(int n, double k) {
+    double coef = bincoef(n, k);
+
+    return coef * pow(0.5, n);
+}
+
+// Natural log of binprobhalf(n, k), written out in log space:
+// ln(n!) - ln(k!) - ln((n-k)!) - n*0.69315, with 0.69315 standing in for
+// ln(2).
+double lnbinprobhalf(int n, double k) {
+    double ln_coef = lnfact(n) - lnfact(k) - lnfact(n-k);
+
+    return ln_coef - 0.69315*n;
+}
+#endif
static
int calculate_consensus_gap5(hts_pos_t pos, int flags, int depth,
pileup_t *plp, consensus_opts *opts,
- consensus_t *cons, int default_qual) {
+ consensus_t *cons, int default_qual,
+ cons_probs *cp) {
int i, j;
static int init_done =0;
static double q2p[101], mqual_pow[256];
// if it's rare.
// Helps a bit on deep data, especially with K2=3, but detrimental on
// shallow and (currently) quite a slow down.
-
-//#define K2 2
#ifdef K2
int hashN[1<<(K2*4+2)] = {0};
int hash1[1<<2] = {0};
if (!init_done) {
init_done = 1;
- consensus_init(opts->P_het);
for (i = 0; i <= 100; i++) {
q2p[i] = pow(10, -i/10.0);
/* Initialise */
int counts[6] = {0};
+#ifdef DO_FRACT
+ int counts2[2][6] = {{0}};
+#endif
/* Accumulate */
int td = depth; // original depth
depth = 0;
+#ifdef DO_POLY_DIST
+ int poly_dist[2][100] = {0};
+#endif
for (; plp; plp = plp->next) {
pileup_t *p = plp;
int base = i >= 0 && i < p->b.core.l_qseq ? X[bam_seqi(seq,i)] : _;
hb = (hb<<2)|base;
}
- // fprintf(samtools_stderr, "%c: %d %d of %d\t%d %d\n", p->base, hashN[hb], hash1[base1], td, p->qual, p->qual * hashN[hb] / hash1[base1]);
#undef _
#endif
// convert from sam base to acgt*n order.
base = L[base];
- double MM, __, _M, qe;
+ double MM, __, _M, oo, oM, o_, uu, um, mm, qe;
// Correction for mapping quality. Maybe speed up via lookups?
// Cannot nullify mapping quality completely. Lots of (true)
if (flags & CONS_MQUAL) {
int mqual = b->core.qual;
if (opts->nm_adjust) {
- mqual /= (nm_local(p, b, pos)+1);
+ //mqual /= (nm_local(p, b, pos)+1);
+ mqual /= (nm_local(p, b, b->core.pos + p->seq_offset+1)+1);
mqual *= 1 + 2*(0.5-(td>30?30:td)/60.0); // depth fudge
}
if (qual < 1)
qual = 1;
- __ = p__[qual]; // neither match
- MM = pMM[qual] - __; // both match
- _M = p_M[qual] - __; // one allele only (half match)
+ double poly = poly_len(p, b, b->core.pos + p->seq_offset+1);
+#ifdef DO_POLY_DIST
+ poly_dist[bam_is_rev(b)][MIN(99,(int)poly)]++;
+#endif
+
+ // EXPERIMENTAL
+ // Adjust qual based on homopolymer length.
+ // Affects different platforms by differing amounts.
+ // May wish to further separate to qual2 and qual3 for ins and del?
+ int qual2 = MAX(1, qual-(poly-2)*cp->poly_mul);
+
+ /* MM=match _M=half-match __=mismatch */
+ __ = cp->p__[qual]; // neither match
+ MM = cp->pMM[qual] - __; // both match
+ _M = cp->p_M[qual] - __; // one allele only (half match)
+
+ /* observation ACGT, but against hypothesis ** or *base */
+ oo = cp->poo[qual2] - __;
+ oM = cp->poM[qual2] - __;
+ o_ = cp->po_[qual2] - __;
+
+ /* observation * */
+ uu = cp->puu[qual2] - __;
+ um = cp->pum[qual2] - __;
+ mm = cp->pmm[qual2] - __;
if (flags & CONS_DISCREP) {
qe = q2p[qual];
sumsC[base] += 1 - qe;
}
+
counts[base]++;
+#ifdef DO_FRACT
+ counts2[bam_is_rev(b)][base]++;
+#endif
+
+ // oM should never be higher than _M for actual bases! or...
+ //printf("base %d@%d MM %f _M %f oM %f\n", base, qual, MM, _M, oM);
switch (base) {
case 0: // A
- S[0] += MM;
- S[1] += _M;
- S[2] += _M;
- S[3] += _M;
- S[4] += _M;
+ S[0] += MM;
+ S[1] += _M;
+ S[2] += _M;
+ S[3] += _M;
+ S[4] += oM;
+ S[8] += o_;
+ S[11] += o_;
+ S[13] += o_;
+ S[14] += oo;
break;
case 1: // C
- S[1] += _M;
- S[5] += MM;
- S[6] += _M;
- S[7] += _M;
- S[8] += _M;
+ S[1] += _M;
+ S[5] += MM;
+ S[6] += _M;
+ S[7] += _M;
+ S[8] += oM;
+ S[4] += o_;
+ S[11] += o_;
+ S[13] += o_;
+ S[14] += oo;
+
+ //fprintf(samtools_stderr, "%d %f %f %f\n", qual, MM+__, oo+__, MM-oo);
break;
case 2: // G
S[ 6] += _M;
S[ 9] += MM;
S[10] += _M;
- S[11] += _M;
+ S[11] += oM;
+ S[4] += o_;
+ S[8] += o_;
+ S[13] += o_;
+ S[14] += oo;
break;
case 3: // T
- S[ 3] += _M;
+ S[ 3] += _M; // _m
S[ 7] += _M;
S[10] += _M;
- S[12] += MM;
- S[13] += _M;
+ S[12] += MM; // mm
+ S[13] += oM;
+ S[4] += o_;
+ S[8] += o_;
+ S[11] += o_;
+ S[14] += oo;
+ // S[14] oo
break;
case 4: // *
- S[ 4] += _M;
- S[ 8] += _M;
- S[11] += _M;
- S[13] += _M;
- S[14] += MM;
+ // under under under under agree-no-base
+ S[0] += uu; S[1 ]+= uu; S[2 ]+= uu; S[3 ]+= uu; S[4 ]+= um;
+ S[5 ]+= uu; S[6 ]+= uu; S[7 ]+= uu; S[8 ]+= um;
+ S[9 ]+= uu; S[10]+= uu; S[11]+= um;
+ S[12]+= uu; S[13]+= um;
+ S[14]+= mm;
break;
case 5: /* N => equal weight to all A,C,G,T but not a pad */
- S[ 0] += MM;
- S[ 1] += MM;
- S[ 2] += MM;
- S[ 3] += MM;
- S[ 4] += _M;
-
- S[ 5] += MM;
- S[ 6] += MM;
- S[ 7] += MM;
- S[ 8] += _M;
-
- S[ 9] += MM;
- S[10] += MM;
- S[11] += _M;
-
- S[12] += MM;
- S[13] += _M;
+ S[0] += MM; S[1 ]+= MM; S[2 ]+= MM; S[3 ]+= MM; S[4 ]+= oM;
+ S[5 ]+= MM; S[6 ]+= MM; S[7 ]+= MM; S[8 ]+= oM;
+ S[9 ]+= MM; S[10]+= MM; S[11]+= oM;
+ S[12]+= MM; S[13]+= oM;
+ S[14]+= oo;
break;
}
depth++;
+ }
+
+#ifdef DO_POLY_DIST
+ // Or compute mean and s.d per strand.
+ // Then compare likelihood of strands coming from the same distribution?
+ // eg s.d=0.59 vs mean=3.41 sd=0.54... hmm
+ //
+ // Or compare ratio of most frequent to next most frequent, for each
+ // strand.
+
+ int d1 = 0, d2 = 0;
+ double nd1 = 0, nd2 = 0;
+ int k;
+ for (k = 0; k < 100; k++) {
+ if (!poly_dist[0][k] && !poly_dist[1][k])
+ continue;
- if (p->eof && p->cd) {
- free(p->cd);
- p->cd = NULL;
+// fprintf(samtools_stdout, "%ld %d %2d %2d\n", pos, k, poly_dist[0][k], poly_dist[1][k]);
+ d1 += (k+1)*poly_dist[0][k];
+ d2 += (k+1)*poly_dist[1][k];
+ nd1 += poly_dist[0][k];
+ nd2 += poly_dist[1][k];
+ }
+// fprintf(samtools_stdout, "Avg = %f / %f %f / %f / %f\n",
+// (d1+d2+1)/(nd1+nd2+1.),
+// (d1+1)/(nd1+1.), (d2+1)/(nd2+1.),
+// (d2+1)/(nd2+1.) - (d1+1)/(nd1+1.),
+// ((d2+1)/(nd2+1.) - (d1+1)/(nd1+1.)) / ((d1+d2+1)/(nd1+nd2+1.)));
+
+ // Find the top two frequent lengths
+ int n1 = 0, n2 = 0, l1 = 0, l2 = 0;
+ for (k = 0; k < 100; k++) {
+ int poly12 = poly_dist[0][k]+poly_dist[1][k];
+ if (n1 < poly12) {
+ n2 = n1; l2 = l1;
+ n1 = poly12;
+ l1 = k;
+ } else if (n2 < poly12) {
+ n2 = poly12;
+ l2 = k;
}
}
+ const double N = 5;
+ nd1 += 1;
+ nd2 += 1;
+
+ // l1 is most common length
+ int pn1p = poly_dist[0][l1];
+ int pn1m = poly_dist[1][l1];
+ // l2 2nd most common
+ int pn2p = poly_dist[0][l2];
+ int pn2m = poly_dist[1][l2];
+
+ // ratio of two most common lengths on +
+ double s1 = (pn1p+N) / (pn2p+N); s1 = s1>1?1/s1:s1;
+ // ratio of two most common lengths on -
+ double s2 = (pn1m+N) / (pn2m+N); s2 = s2>1?1/s2:s2;
+
+ // ratio of s1 and s2 to identify strand bias
+ double sbias = s1 / s2; sbias = sbias>1?1/sbias:sbias;
+
+ if (pn2p+pn2m > 0 && l1 != l2) {
+// fprintf(samtools_stdout, "len %d,%d + %d,%d - %d,%d\tbias = %f %f, %f %f\t%ld\n",
+// l1, l2, pn1p, pn2p, pn1m, pn2m,
+// s1, s2, sbias, sqrt(sbias)-1, pos);
+
+ // adjust score for het indels
+ // sbias is close to 0 for strong strand bias, and 1 for none
+ sbias = 10*log(sbias);//+.5);
+ S[ 4] += sbias; // A*
+ S[ 8] += sbias; // C*
+ S[11] += sbias; // G*
+ S[13] += sbias; // T*
+ } else {
+ sbias = 0;
+ }
+#endif
+
/* We've accumulated stats, so now we speculate on the consensus call */
double shift, max, max_het, norm[15];
int call = 0, het_call = 0;
max = -DBL_MAX;
max_het = -DBL_MAX;
+#ifdef DO_FRACT
+ // Filter by --min-depth and --het-fract.
+ // Also add a slight adjustment for strand bias.
+ for (j = 0; j < 15; j++) {
+ if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14)
+ continue;
+
+ double c1p = counts2[0][map_het[j]%5];
+ double c1m = counts2[1][map_het[j]%5];
+ double c2p = counts2[0][map_het[j]/5];
+ double c2m = counts2[1][map_het[j]/5];
+
+ double c1 = c1p + c1m;
+ double c2 = c2p + c2m;
+
+ if (c1 && c2) {
+ // Slight decrease in confidence if strong strand bias.
+ const int N = 10; // avoid low sample size problems
+ double b1 = 1 - (N+MIN(c1p,c1m))/(N+MAX(c1p,c1m));
+ double b2 = 1 - (N+MIN(c2p,c2m))/(N+MAX(c2p,c2m));
+ if (b1 > 0.5) S[j] -= b1;
+ if (b2 > 0.5) S[j] -= b2;
+
+ // Fraction based filtering, via --min-depth and --het-fract opts.
+ c1 += 1e-5;
+ c2 += 1e-5;
+ if (c2 > c1) {
+ double tmp = c2;
+ c2 = c1;
+ c1 = tmp;
+ }
+
+ if (c2 < opts->min_depth)
+ S[j] -= 100;
+ if (c2 / (c1+1e-5) <= opts->het_fract)
+ S[j] -= 100;
+ }
+ }
+#endif
+
+#ifdef DO_HDW
+ /*
+ * Apply Hardy-Weinberg statistics for heterozygous sites.
+ * This helps, but it also loses sensitivity a little.
+ */
for (j = 0; j < 15; j++) {
- S[j] += lprior15[j];
+ if (j == 0 || j == 5 || j == 9 || j == 12 || j == 14)
+ continue;
+
+ double c1 = counts[map_het[j]%5];
+ double c2 = counts[map_het[j]/5];
+
+ if (c1 && c2) {
+ c1 += 1e-5;
+ c2 += 1e-5;
+ if (c2 > c1) {
+ double tmp = c2;
+ c2 = c1;
+ c1 = tmp;
+ }
+
+ // Limit depth for HW as we'll have an allele freq difference,
+ // even if it's just caused by alignment reference bias.
+ double c12 = c1+c2;
+ if (c12 > 20) {
+ c2 *= 20/(c12);
+ c12 = 20;
+ c1 = 20-c2;
+ }
+
+ // Helps a little, especially reducing FN deletions.
+ c1+=1;
+ c2+=1;
+ c12+=2;
+ S[j] += lnbinprobhalf(c12, c2) + fast_log2(c12)*0.69+.2;
+ }
+ }
+#endif
+
+ for (j = 0; j < 15; j++) {
+ S[j] += cp->lprior15[j];
if (shift < S[j])
shift = S[j];
return 0;
}
+// Wrapper around calculate_consensus_gap5 that selects the parameter set from opts->mode.
+// Any mode other than MODE_MIXED makes a single call, using cp_p for MODE_PRECISE and cp_r
+// otherwise, and hands that call's return value straight back.
+//
+// MODE_MIXED calls it twice (cp_p favouring precision, cp_r favouring recall) and blends the two
+// results into *cons: agreement boosts the quality, disagreement lowers it. That path returns 0.
+int calculate_consensus_gap5m(hts_pos_t pos, int flags, int depth,
+ pileup_t *plp, consensus_opts *opts,
+ consensus_t *cons, int default_qual,
+ cons_probs *cp_r, cons_probs *cp_p) {
+ if (opts->mode != MODE_MIXED)
+ return calculate_consensus_gap5(pos, flags, depth, plp, opts,
+ cons, default_qual,
+ opts->mode == MODE_PRECISE
+ ? cp_p : cp_r);
+
+ // EXPERIMENTAL: mixed mode. Both calls get identical arguments apart from the output and the parameter set; their return values are not checked.
+ consensus_t consP, consR;
+ // Favours precision (result in consP)
+ calculate_consensus_gap5(pos, flags, depth, plp, opts,
+ &consP, default_qual, cp_p);
+ // Favours recall (result in consR)
+ calculate_consensus_gap5(pos, flags, depth, plp, opts,
+ &consR, default_qual, cp_r);
+
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#define MAX(a,b) ((a)>(b)?(a):(b))
+
+ // Initial starting point is the precise-mode result; the branches below adjust or replace it
+ memcpy(cons, &consP, sizeof(consP));
+
+ if (consP.phred > 0 && consR.phred > 0 && consP.call == consR.call) {
+ // Both strategies give the same call with a positive phred (HOM agreement)
+ // Boost qual as both in agreement, by the recall phred capped at 20
+ cons->phred += MIN(20, consR.phred);
+
+ } else if (consP.het_logodd >= 0 && consR.het_logodd >= 0 &&
+ consP.het_call == consR.het_call) {
+ // Both strategies give the same het_call with non-negative het_logodd (HET agreement)
+ // Boost qual as both in agreement, by the recall het_logodd capped at 20
+ cons->het_logodd += MIN(20, consR.het_logodd);
+
+ } else if (consP.het_logodd >= 0) {
+ // Accurate method claims heterozygous, so go with it.
+ // However sensitive method disagrees, so subtract half of its larger score, keeping at least 1.
+ int q2 = MAX(consR.phred, consR.het_logodd);
+ cons->het_logodd = MAX(1, (cons->het_logodd - q2/2));
+
+ } else if (consR.het_logodd >= 70) {
+ // Accurate is homozygous and consR is het, so we go with it instead
+ // but at a lower quality value (het_logodd is capped at 15 below).
+ // TODO: may wish to check HET is consistent with HOM? Very unlikely
+ // not to be though.
+ int q1 = consP.phred;
+ int q2 = consR.het_logodd;
+ memcpy(cons, &consR, sizeof(consR));
+ cons->het_logodd = MIN(15, MAX((q2-q1*2)/2, 1+q2/(q1+1.0)));
+
+ } else if (consR.het_logodd >= 0) {
+ // As above, but the recall het call is low quality (below 70): take consR, scale het_logodd down by the precise phred, add 5 if the het_calls agree, and zero phred
+ int q1 = consP.phred;
+ int q2 = consR.het_logodd;
+ memcpy(cons, &consR, sizeof(consR));
+ cons->het_logodd = MAX(1,q2 - 0.3*q1)
+ + 5*(consP.het_call == consR.het_call);
+ cons->phred = 0;
+
+ } else if (consR.het_logodd < 0) {
+ // Neither are heterozygous, but differing in phred call (V rare).
+ // Halve the recall phred, keep whichever result then has the higher phred, and floor it at 10.
+ consR.phred = consR.phred / 2;
+ if (consR.phred > consP.phred)
+ memcpy(cons, &consR, sizeof(consR));
+ cons->phred = MAX(10, cons->phred);
+ }
+
+ return 0;
+}
/* --------------------------------------------------------------------------
* Main processing logic
* standard pileup criteria (eg COG-UK / CLIMB Covid-19 seq project).
*
*
- * call1 / score1 / depth1 is the highest scoring allele.
- * call2 / score2 / depth2 is the second highest scoring allele.
+ * call1 / score1 is the highest scoring allele.
+ * call2 / score2 is the second highest scoring allele.
*
* Het_fract: score2/score1
* Call_fract: score1 or score1+score2 over total score
- * Min_depth: minimum total depth of utilised bases (depth1+depth2)
+ * Min_depth: minimum total depth of unfiltered bases (above qual/mqual)
* Min_score: minimum total score of utilised bases (score1+score2)
*
* Eg het_fract 0.66, call_fract 0.75 and min_depth 10.
static int calculate_consensus_simple(const pileup_t *plp,
consensus_opts *opts, int *qual) {
int i, min_qual = opts->min_qual;
+ int tot_depth = 0;
// Map "seqi" nt16 to A,C,G,T compatibility with weights on pure bases.
// where seqi is A | (C<<1) | (G<<2) | (T<<3)
freq[16] ++;
score[16]+=8 * (opts->use_qual ? q : 1);
}
+ tot_depth++;
}
// Total usable depth
// Best and second best potential calls
int call1 = 15, call2 = 15;
- int depth1 = 0, depth2 = 0;
int score1 = 0, score2 = 0;
for (i = 0; i < 5; i++) {
int c = 1<<i; // A C G T *
if (score1 < score[c]) {
- depth2 = depth1;
score2 = score1;
call2 = call1;
- depth1 = freq[c];
score1 = score[c];
call1 = c;
} else if (score2 < score[c]) {
- depth2 = freq[c];
score2 = score[c];
call2 = c;
}
// Work out which best and second best are usable as a call
int used_score = score1;
- int used_depth = depth1;
int used_base = call1;
if (score2 >= opts->het_fract * score1 && opts->ambig) {
used_base |= call2;
used_score += score2;
- used_depth += depth2;
}
// N is too shallow, or insufficient proportion of total
- if (used_depth < opts->min_depth ||
+ if (tot_depth < opts->min_depth ||
used_score < opts->call_fract * tscore) {
- used_depth = 0;
// But note shallow gaps are still called gaps, not N, as
// we're still more confident there is no base than it is
// A, C, G or T.
- used_base = call1 == 16 /*&& depth1 >= call_fract * depth*/
- ? 16 : 0; // * or N
+ used_base = call1 == 16 ? 16 : 0; // * or N
}
// Our final call. "?" shouldn't be possible to generate
"NACMGRSVTWYHKDBN"
"*ac?g???t???????";
- //printf("%c %d\n", het[used_base], used_depth);
+ //printf("%c %d\n", het[used_base], tot_depth);
if (qual)
*qual = used_base ? 100.0 * used_score / tscore : 0;
}
}
- if (opts->gap5) {
+ if (opts->mode != MODE_SIMPLE) {
consensus_t cons;
- calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0,
- depth, p, opts, &cons, opts->default_qual);
+ calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
+ depth, p, opts, &cons, opts->default_qual,
+ &cons_prob_recall, &cons_prob_precise);
if (cons.het_logodd > 0 && opts->ambig) {
cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
"MCSYc"
}
// share this with basic_pileup
- if (opts->gap5) {
+ if (opts->mode != MODE_SIMPLE) {
consensus_t cons;
- calculate_consensus_gap5(pos, opts->use_mqual ? CONS_MQUAL : 0,
- depth, p, opts, &cons, opts->default_qual);
+ calculate_consensus_gap5m(pos, opts->use_mqual ? CONS_MQUAL : 0,
+ depth, p, opts, &cons, opts->default_qual,
+ &cons_prob_recall, &cons_prob_precise);
if (cons.het_logodd > 0 && opts->ambig) {
cb = "AMRWa" // 5x5 matrix with ACGT* per row / col
"MCSYc"
opts->last_tid = tid;
return 0;
}
+ if (opts->mark_ins && nth && cb != '*') {
+ kputc('_', seq);
+ kputc('_', qual);
+ }
+
// end of share
// Append consensus base/qual to seqs
return 0;
}
+
// END OF NEW PILEUP
//---------------------------------------------------------------------------
fprintf(fp, " Exclude reads with any flag bit set\n");
fprintf(fp, " [UNMAP,SECONDARY,QCFAIL,DUP]\n");
fprintf(fp, " --min-MQ INT Exclude reads with mapping quality below INT [0]\n");
+ fprintf(fp, " --min-BQ INT Exclude reads with base quality below INT [0]\n");
fprintf(fp, " --show-del yes/no Whether to show deletion as \"*\" [no]\n");
fprintf(fp, " --show-ins yes/no Whether to show insertions [yes]\n");
+ fprintf(fp, " --mark-ins Add '+' before every inserted base/qual [off]\n");
fprintf(fp, " -A, --ambig Enable IUPAC ambiguity codes [off]\n");
fprintf(fp, "\nFor simple consensus mode:\n");
fprintf(fp, " -q, --(no-)use-qual Use quality values in calculation [off]\n");
fprintf(fp, " -c, --call-fract INT At least INT portion of bases must agree [0.75]\n");
- fprintf(fp, " -d, --min-depth INT Minimum depth of INT [1]\n");
- fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.5]\n");
+ fprintf(fp, " -d, --min-depth INT Minimum depth of INT [2]\n");
+ fprintf(fp, " -H, --het-fract INT Minimum fraction of 2nd-most to most common base [0.15]\n");
fprintf(fp, "\nFor default \"Bayesian\" consensus mode:\n");
fprintf(fp, " -C, --cutoff C Consensus cutoff quality C [10]\n");
fprintf(fp, " --(no-)adj-qual Modify quality with local minima [on]\n");
fprintf(fp, " --high-MQ INT Cap maximum mapping quality [60]\n");
fprintf(fp, " --P-het FLOAT Probability of heterozygous site[%.1e]\n",
P_HET);
+ fprintf(fp, " --P-indel FLOAT Probability of indel sites[%.1e]\n",
+ P_INDEL);
+ fprintf(fp, " --het-scale FLOAT Heterozygous SNP probability multiplier[%.1e]\n",
+ P_HET_SCALE);
+ fprintf(fp, " -p, --homopoly-fix Spread low-qual bases to both ends of homopolymers\n");
+ fprintf(fp, " --homopoly-score FLOAT\n"
+ " Qual fraction adjustment for -p option [%g]\n", P_HOMOPOLY);
+ fprintf(fp, " -t, --qual-calibration FILE / :config (see man page)\n");
+ fprintf(fp, " Load quality calibration file\n");
+ fprintf(fp, "\n");
+ fprintf(fp, " -X, --config STR Use pre-defined configuration set. STR from:\n");
+ fprintf(fp, " hiseq, hifi, r10.4_sup, r10.4_dup and ultima\n");
fprintf(fp, "\nGlobal options:\n");
sam_global_opt_help(fp, "-.---@-.");
consensus_opts opts = {
// User options
- .gap5 = 1,
+ .mode = MODE_RECALL,
.use_qual = 0,
.min_qual = 0,
.adj_qual = 1,
.all_bases = 0,
.show_del = 0,
.show_ins = 1,
+ .mark_ins = 0,
.incl_flags = 0,
.excl_flags = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP,
.min_mqual = 0,
.P_het = P_HET,
+ .P_indel = P_INDEL,
+ .het_scale = P_HET_SCALE,
+ .homopoly_fix = 0,
+ .homopoly_redux = 0.01,
// Internal state
.ks_line = {0,0},
.last_pos = -1,
};
+ set_qcal(&opts.qcal, QCAL_FLAT);
+
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', '-', '-', '@'),
{"het-only", no_argument, NULL, 6},
{"show-del", required_argument, NULL, 7},
{"show-ins", required_argument, NULL, 8},
+ {"mark-ins", no_argument, NULL, 18},
{"output", required_argument, NULL, 'o'},
{"incl-flags", required_argument, NULL, 11},
{"rf", required_argument, NULL, 11},
{"excl-flags", required_argument, NULL, 12},
{"ff", required_argument, NULL, 12},
{"min-MQ", required_argument, NULL, 13},
+ {"min-BQ", required_argument, NULL, 16},
{"P-het", required_argument, NULL, 15},
+ {"P-indel", required_argument, NULL, 17},
+ {"het-scale", required_argument, NULL, 19},
{"mode", required_argument, NULL, 'm'},
+ {"homopoly-fix", no_argument, NULL, 'p'},
+ {"homopoly-score", required_argument, NULL, 'p'+100},
+ {"homopoly-redux", required_argument, NULL, 'p'+200},
+ {"qual-calibration", required_argument, NULL, 't'},
+ {"config", required_argument, NULL, 'X'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:",
+ while ((c = getopt_long(argc, argv, "@:qd:c:H:r:5f:C:aAl:o:m:pt:X:",
lopts, NULL)) >= 0) {
switch (c) {
case 'a': opts.all_bases++; break;
case 'r': opts.reg = optarg; break;
case 'C': opts.cons_cutoff = atoi(optarg); break;
case 'A': opts.ambig = 1; break;
+ case 'p': opts.homopoly_fix = P_HOMOPOLY; break;
+ case 'p'+100: opts.homopoly_fix = atof(optarg); break;
+ case 'p'+200:
+ // EXPERIMENTAL
+ opts.homopoly_redux = atof(optarg); break;
case 1: opts.default_qual = atoi(optarg); break;
case 6: opts.het_only = 1; break;
case 7: opts.show_del = (*optarg == 'y' || *optarg == 'Y'); break;
case 8: opts.show_ins = (*optarg == 'y' || *optarg == 'Y'); break;
+ case 18: opts.mark_ins = 1; break;
case 13: opts.min_mqual = atoi(optarg); break;
+ case 16: opts.min_qual = atoi(optarg); break;
case 15: opts.P_het = atof(optarg); break;
+ case 17: opts.P_indel = atof(optarg); break;
+ case 19: opts.het_scale = atof(optarg); break;
case 'q'+100: opts.adj_qual = 1; break;
case 'q'+101: opts.adj_qual = 0; break;
case 'm'+100: opts.nm_adjust = 1; break;
case 'm': // mode
if (strcasecmp(optarg, "simple") == 0) {
- opts.gap5 = 0;
- } else if (strcasecmp(optarg, "bayesian") == 0) {
- opts.gap5 = 1;
+ opts.mode = MODE_SIMPLE;
+ } else if (strcasecmp(optarg, "bayesian_m") == 0) {
+ // EXPERIMENTAL:
+ // A mixture of modified precise/recall params and a
+ // blending of the two. Sometimes helps a bit.
+ opts.mode = MODE_MIXED;
+ } else if (strcasecmp(optarg, "bayesian_p") == 0) {
+ // EXPERIMENTAL:
+ // favours precision
+ opts.mode = MODE_PRECISE;
+ } else if (strcasecmp(optarg, "bayesian_r") == 0 ||
+ strcasecmp(optarg, "bayesian") == 0) {
+ // favours recall; the default
+ opts.mode = MODE_RECALL;
+ } else if (strcasecmp(optarg, "bayesian_116") == 0) {
+ opts.mode = MODE_BAYES_116;
} else {
fprintf(samtools_stderr, "Unknown mode %s\n", optarg);
return 1;
}
break;
+ case 'X':
+ if (strcasecmp(optarg, "hifi") == 0) {
+ set_qcal(&opts.qcal, QCAL_HIFI);
+ opts.mode = MODE_RECALL;
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.low_mqual = 5;
+ opts.scale_mqual = 1.5;
+ opts.het_scale = 0.37;
+ } else if (strcasecmp(optarg, "hiseq") == 0) {
+ opts.mode = MODE_RECALL;
+ set_qcal(&opts.qcal, QCAL_HISEQ);
+ opts.homopoly_redux = 0.01;
+ } else if (strcasecmp(optarg, "r10.4_sup") == 0) {
+ // Same as HiFi params, but ONT calibration table.
+ // At higher depth, hifi params work well for ONT
+ // when combined with ONT calibration chart.
+ //
+ // At lower depth we gain a bit from increasing homopoly_redux
+ set_qcal(&opts.qcal, QCAL_ONT_R10_4_SUP);
+ opts.mode = MODE_RECALL;
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.low_mqual = 5;
+ opts.scale_mqual = 1.5;
+ opts.het_scale = 0.37;
+
+ // Also consider, for lower depth:
+ // opts.homopoly_redux = 1;
+ // opts.scale_mqual = 1;
+ // opts.het_scale = 0.45;
+ } else if (strcasecmp(optarg, "r10.4_dup") == 0) {
+ // Just a copy of HiFi for duplex currently until
+ // we get a good truth set for calibration.
+ set_qcal(&opts.qcal, QCAL_ONT_R10_4_DUP);
+ opts.mode = MODE_RECALL;
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.low_mqual = 5;
+ opts.scale_mqual = 1.5;
+ opts.het_scale = 0.37;
+ } else if (strcasecmp(optarg, "ultima") == 0) {
+ // Very similar to HiFi, but with own calibration table
+ opts.mode = MODE_RECALL;
+ set_qcal(&opts.qcal, QCAL_ULTIMA);
+ opts.homopoly_fix = 0.3;
+ opts.homopoly_redux = 0.01;
+ opts.het_scale = 0.37;
+ opts.scale_mqual = 2;
+ opts.low_mqual = 10;
+ } else {
+ // NB consider defaults that are a mixture of all above.
+ // Options are all similar for all bar Illumina.
+ // Unsure what :flat calibration table does to each of
+ // these though.
+ fprintf(samtools_stderr, "Unrecognised configuration name: \"%s\"\n",
+ optarg);
+ return 1;
+ }
+ break;
+
case 11:
if ((opts.incl_flags = bam_str2flag(optarg)) < 0) {
print_error("consensus", "could not parse --rf %s", optarg);
}
break;
+ case 't': // --qual-calibration
+ if (load_qcal(&opts.qcal, optarg) < 0) {
+ print_error("consensus",
+ "failed to load quality calibration '%s'",
+ optarg);
+ return -1;
+ }
+ break;
+
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?':
}
}
+#if 0
+ // Dump out the qcal table. Useful for copying into the code above.
+ int i;
+ qcal_t *q = &opts.qcal;
+ fprintf(samtools_stderr, "{");
+ for (i = 0; i < 100; i++)
+ fprintf(samtools_stderr, "%2d,%s", q->smap[i],(i+1)%10?" ":"\n");
+ fprintf(samtools_stderr, "},\n{");
+ for (i = 0; i < 100; i++)
+ fprintf(samtools_stderr, "%2d,%s", q->umap[i],(i+1)%10?" ":"\n");
+ fprintf(samtools_stderr, "},\n{");
+ for (i = 0; i < 100; i++)
+ fprintf(samtools_stderr, "%2d,%s", q->omap[i],(i+1)%10?" ":"\n");
+ fprintf(samtools_stderr, "}\n");
+#endif
+
+ if (opts.mode != MODE_SIMPLE) {
+ if (opts.mode == MODE_PRECISE)
+ // More accuracy / precision, but a significant drop
+ // in recall.
+ consensus_init(opts.P_het, opts.P_indel,
+ 0.3 * opts.het_scale, opts.homopoly_redux,
+ &opts.qcal, MODE_PRECISE, &cons_prob_precise);
+
+ if (opts.mode == MODE_MIXED)
+ // Blend these in when running in mixed mode, so we can
+ // keep sensitivity but have a better joint quality to
+ // reduce the FP rate.
+ consensus_init(pow(opts.P_het, 0.7), pow(opts.P_indel, 0.7),
+ 0.3 * opts.het_scale, opts.homopoly_redux,
+ &opts.qcal, MODE_PRECISE, &cons_prob_precise);
+
+ // Better recall, at a cost of some accuracy (false positives)
+ consensus_init(opts.P_het, opts.P_indel, opts.het_scale,
+ opts.mode == MODE_RECALL ? opts.homopoly_redux : 0.01,
+ &opts.qcal, MODE_RECALL, &cons_prob_recall);
+ }
+
if (argc != optind+1) {
if (argc == optind) usage_exit(samtools_stdout, EXIT_SUCCESS);
else usage_exit(samtools_stderr, EXIT_FAILURE);
}
if (opts.fmt == PILEUP) {
- if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL,
- basic_pileup, &opts) < 0)
+ if (pileup_loop(opts.fp, opts.h, readaln2,
+ opts.mode != MODE_SIMPLE ? nm_init : NULL,
+ basic_pileup,
+ opts.mode != MODE_SIMPLE ? nm_free : NULL,
+ &opts) < 0)
goto err;
if (opts.all_bases) {
goto err;
}
} else {
- if (pileup_loop(opts.fp, opts.h, readaln2, opts.gap5 ? nm_init : NULL,
+ if (pileup_loop(opts.fp, opts.h, readaln2,
+ opts.mode != MODE_SIMPLE ? nm_init : NULL,
basic_fasta,
+ opts.mode != MODE_SIMPLE ? nm_free : NULL,
&opts) < 0)
goto err;
if (opts.all_bases) {
* samtools import a_1.fq a_2.fq
* samtools import a_interleaved.fq
*
- * Copyright (C) 2020 Genome Research Ltd.
+ * Copyright (C) 2020-2021 Genome Research Ltd.
*
* Author: James Bonfield <jkb@sanger.ac.uk>
*/
* samtools import a_1.fq a_2.fq
* samtools import a_interleaved.fq
*
- * Copyright (C) 2020 Genome Research Ltd.
+ * Copyright (C) 2020-2021 Genome Research Ltd.
*
* Author: James Bonfield <jkb@sanger.ac.uk>
*/
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2022 Genome Research Ltd.
+ Copyright (C) 2017-2023 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
int rgx_t;
char *barcode;
regex_t *bc_rgx;
+ int read_groups;
+ int json;
} md_param_t;
typedef struct {
int32_t this_ref;
int32_t other_ref;
int32_t barcode;
+ int32_t read_group;
int8_t single;
int8_t leftmost;
int8_t orientation;
key_data_t single_key;
bam1_t *b;
struct read_queue_s *duplicate;
+ struct read_queue_s *original;
hts_pos_t pos;
int dup_checked;
+ int read_group;
} read_queue_t;
typedef struct {
typedef struct {
char *name;
char type;
+ int read_group;
} dup_map_t;
typedef struct {
int end;
} check_t;
-
typedef struct {
check_t *c;
size_t size;
size_t length;
} check_list_t;
+typedef struct { // Block of long counters; the field names suggest per-run read/duplicate tallies for markdup statistics -- TODO confirm against the code that fills them
+ long reading;
+ long writing;
+ long excluded;
+ long duplicate;
+ long single;
+ long pair;
+ long single_dup;
+ long examined;
+ long optical;
+ long single_optical;
+ long np_duplicate;
+ long np_opt_duplicate;
+} stats_block_t;
+
static khint32_t do_hash(unsigned char *key, khint32_t len);
static khint_t hash_key(key_data_t key) {
khint_t hash;
if (key.single) {
- unsigned char sig[17];
+ unsigned char sig[21];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.orientation, 1); i += 1;
memcpy(sig + i, &key.barcode, 4); i += 4;
+ memcpy(sig + i, &key.read_group, 4); i += 4;
hash = do_hash(sig, i);
} else {
- unsigned char sig[30];
+ unsigned char sig[34];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.leftmost, 1); i += 1;
memcpy(sig + i, &key.orientation, 1); i += 1;
memcpy(sig + i, &key.barcode, 4); i += 4;
+ memcpy(sig + i, &key.read_group, 4); i += 4;
hash = do_hash(sig, i);
}
match = 0;
else if (a.barcode != b.barcode)
match = 0;
+ else if (a.read_group != b.read_group)
+ match = 0;
if (!a.single) {
if (a.other_coord != b.other_coord)
KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash
KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id
+KHASH_MAP_INIT_STR(read_groups, int) // read group lookup
/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
if ((data = bam_aux_get(b, "ms"))) {
score = bam_aux2i(data);
} else {
- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
return -1;
}
read is leftmost of the pair. */
-static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
+static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) {
hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
int32_t this_ref, other_ref, barcode = 0;
int8_t orientation, left_read;
if ((data = bam_aux_get(bam, "MC"))) {
if (!(cig = bam_aux2Z(data))) {
- fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
+ print_error("markdup", "error, MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
return 1;
}
other_end = unclipped_other_end(bam->core.mpos, cig);
other_coord = unclipped_other_start(bam->core.mpos, cig);
} else {
- fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no MC tag. Please run samtools fixmate on file first.\n");
return 1;
}
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ print_error("markdup", "warning, %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
}
} else {
barcode = do_hash((unsigned char *)bar, strlen(bar));
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname);
}
}
} else {
char warn_msg[256];
regerror(result, param->bc_rgx, warn_msg, 256);
- fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
}
}
}
if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
- fprintf(stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ print_error("markdup", "warning, %ld barcode read warnings. New warnings will not be reported.\n",
*warnings);
}
key->leftmost = left_read;
key->orientation = orientation;
key->barcode = barcode;
+ key->read_group = rg_num;
return 0;
}
Uses unclipped start (or end depending on orientation), reference id,
and orientation. */
-static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
+static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) {
hts_pos_t this_coord;
int32_t this_ref, barcode = 0;
int8_t orientation;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ print_error("markdup", "warning, %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
}
} else {
barcode = do_hash((unsigned char *)bar, strlen(bar));
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname);
}
}
} else {
char warn_msg[256];
regerror(result, param->bc_rgx, warn_msg, 256);
- fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
}
}
}
if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
- fprintf(stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ print_error("markdup", "warning, %ld barcode read warnings. New warnings will not be reported.\n",
*warnings);
}
+
key->single = 1;
key->this_ref = this_ref;
key->this_coord = this_coord;
key->orientation = orientation;
key->barcode = barcode;
+ key->read_group = rg_num;
}
/* Add the duplicate name to a hash if it does not exist. */
-static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) {
+static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type, int group) {
khiter_t d;
int ret;
kh_value(d_hash, d).name = strdup(orig_name);
if (kh_value(d_hash, d).name == NULL) {
- fprintf(stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n");
+ print_error("markdup", "error, unable to allocate memory for duplicate original name.\n");
return 1;
}
} else {
}
kh_value(d_hash, d).type = type;
+ kh_value(d_hash, d).read_group = group;
} else {
- fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n");
+ print_error("markdup", "error, unable to store supplementary duplicates.\n");
free(name);
return 1;
}
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname);
+ print_error("markdup", "warning, cannot decipher read name %s for optical duplicate marking.\n", qname);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname);
+ print_error("markdup", "warning, cannot decipher x coordinate in %s .\n", qname);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", qname);
+ print_error("markdup", "warning, cannot decipher y coordinate in %s .\n", qname);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen);
+ print_error("markdup", "warning, x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord);
+ print_error("markdup", "warning, cannot decipher x coordinate in %s (%s).\n", qname, coord);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen);
+ print_error("markdup", "warning, y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord);
+ print_error("markdup", "warning, cannot decipher y coordinate in %s (%s).\n", qname, coord);
}
return 1;
/* Mark the read as a duplicate and update the duplicate hash (if needed) */
static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
- long *optical, long *warn) {
+ int read_group, long *optical, long *warn) {
char dup_type = 0;
long incoming_warnings = *warn;
if (param->tag) {
if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
- fprintf(stderr, "[markdup] error: unable to append 'do' tag.\n");
+ print_error("markdup", "error, unable to append 'do' tag.\n");
return -1;
}
}
}
if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) {
- fprintf(stderr, "[markdup] warning: %ld decipher read name warnings. New warnings will not be reported.\n",
+ print_error("markdup", "warning, %ld decipher read name warnings. New warnings will not be reported.\n",
*warn);
}
original = bam_get_qname(ori);
}
- if (add_duplicate(dup_hash, dup, original, dup_type))
+ if (add_duplicate(dup_hash, dup, original, dup_type, read_group))
return -1;
}
}
/* If the duplicate type has changed to optical then retag and duplicate hash. */
-static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
+static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, stats_block_t *stats) {
int ret = 0;
if (bam_aux_update_str(b, "dt", 3, "SQ")) {
- fprintf(stderr, "[markdup] error: unable to update 'dt' tag.\n");
+ print_error("markdup", "error, unable to update 'dt' tag.\n");
ret = -1;
}
if (paired) {
- (*optical_pair)++;
+ stats->optical++;
} else {
- (*optical_single)++;
+ stats->single_optical++;
}
if (param->supp) {
if (d == kh_end(dup_hash)) {
// error, name should already be in dup hash
- fprintf(stderr, "[markdup] error: duplicate name %s not found in hash.\n",
+ print_error("markdup", "error, duplicate name %s not found in hash.\n",
bam_get_qname(b));
ret = -1;
} else {
Returns 0 on success, >0 on coordinate reading error (program can continue) or
<0 on an error (program should not continue. */
static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
- check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
+ check_list_t *list, long *warn, stats_block_t *stats) {
int ret = 0, coord_fail = 0;
char *ori_name = bam_get_qname(ori->b);
list->size *= 2;
if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
- fprintf(stderr, "[markdup] error: Unable to expand opt check list.\n");
+ print_error("markdup", "error, Unable to expand optical check list.\n");
return -1;
}
if (old_name) {
if (strcmp(old_name, ori_name) != 0) {
if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
- fprintf(stderr, "[markdup] error: unable to update 'do' tag.\n");
+ print_error("markdup", "error, unable to update 'do' tag.\n");
ret = -1;
break;
}
}
} else {
- fprintf(stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b));
+ print_error("markdup", "error, 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b));
ret = -1;
break;
}
is_opt = optical_duplicate_partial(param, ori_name, t_beg, t_end, x, y, current->b, c, param->opt_dist, warn);
if (!c->opt && is_opt) {
- if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+ if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
ret = -1;
break;
}
if (current_paired) {
if ((c->mate_score = get_mate_score(current->b)) == -1) {
- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
ret = -1;
break;
}
if (!ret && coord_fail)
ret = coord_fail;
+ ori->dup_checked = 1;
+
return ret;
}
/* Check all the duplicates against each other to see if they are optical duplicates. */
static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
- long *warn, long *optical_single, long *optical_pair) {
+ long *warn, stats_block_t *stats) {
int ret = 0;
size_t curr = 0;
if (chk_dup) {
// the duplicate is the optical duplicate
if (!chk->opt) { // only change if not already an optical duplicate
- if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
+ if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
ret = -1;
goto fail;
}
}
} else {
if (!current->opt) {
- if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+ if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
ret = -1;
goto fail;
}
/* Where there is more than one duplicate go down the list and check for optical duplicates and change
do tags (where used) to point to original (non-duplicate) read. */
-static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
- const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
- long *optical_pair, const int check_range) {
+static int find_duplicate_chains(md_param_t *param, read_queue_t *in_read , khash_t(duplicates) *dup_hash, check_list_t *dup_list,
+ long *warn, stats_block_t *stats) {
int ret = 0;
- kliter_t(read_queue) *rq;
-
- rq = kl_begin(read_buffer);
- while (rq != kl_end(read_buffer)) {
- read_queue_t *in_read = &kl_val(rq);
+ while (in_read->original) in_read = in_read->original;
- if (check_range) {
- /* Just check against the moving window of reads based on coordinates and max read length. */
- if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
- break;
- }
- } else {
- // this is the last set of results and the end entry will be blank
- if (!bam_get_qname(in_read->b)) {
- break;
- }
+ // check against the original for tagging and optical duplication
+ if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, stats + in_read->read_group))) {
+ if (ret < 0) { // real error
+ ret = -1;
+ } else { // coordinate decoding error
+ ret = 0;
}
-
- if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain
-
- // check against the original for tagging and optical duplication
- if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) {
- if (ret < 0) { // real error
- ret = -1;
- break;
- } else { // coordinate decoding error
- ret = 0;
- in_read->duplicate = NULL;
- continue;
- }
- }
-
- // check the rest of the duplicates against each other for optical duplication
- if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
- ret = -1;
- break;
- }
-
- in_read->duplicate = NULL;
+ } else {
+ // check the rest of the duplicates against each other for optical duplication
+ if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, stats + in_read->read_group)) {
+ ret = -1;
}
-
- rq = kl_next(rq);
}
return ret;
int i;
if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
- fprintf(stderr, "[markdup] warning: unable to calculate estimated library size.\n");
+ print_error("markdup", "warning, unable to calculate estimated library size.\n");
return estimated_size;
}
estimated_size = (unsigned long)(unique_pairs * (m + M) / 2);
} else {
- fprintf(stderr, "[markdup] warning: unable to calculate estimated library size."
+ print_error("markdup", "warning, unable to calculate estimated library size."
" Read pairs %ld should be greater than duplicate pairs %ld,"
" which should both be non zero.\n",
non_optical_pairs, duplicate_pairs);
}
+/* Write one block of duplicate-marking statistics to fp as plain "NAME: value" lines.
+ *
+ * fp        - stream the report is written to.
+ * title     - optional heading prefix; when NULL no heading line is written.
+ * title_con - text printed directly after title on the heading line.  Only
+ *             read when title is non-NULL.
+ *             NOTE(review): passed straight to "%s", so callers should make
+ *             sure it is not NULL - confirm against the call sites.
+ * stats     - counters to report.
+ *
+ * The estimated library size is computed here from the pair, duplicate and
+ * optical counters via estimate_library_size().
+ */
+static void write_stats(FILE *fp, const char *title, const char *title_con, stats_block_t *stats) {
+    unsigned long els;
+
+    els = estimate_library_size(stats->pair, stats->duplicate, stats->optical);
+
+    if (title) {
+        fprintf(fp, "%s%s\n", title, title_con);
+    }
+
+    // els is unsigned long, so it is printed with %lu rather than %ld.
+    fprintf(fp,
+            "READ: %ld\n"
+            "WRITTEN: %ld\n"
+            "EXCLUDED: %ld\n"
+            "EXAMINED: %ld\n"
+            "PAIRED: %ld\n"
+            "SINGLE: %ld\n"
+            "DUPLICATE PAIR: %ld\n"
+            "DUPLICATE SINGLE: %ld\n"
+            "DUPLICATE PAIR OPTICAL: %ld\n"
+            "DUPLICATE SINGLE OPTICAL: %ld\n"
+            "DUPLICATE NON PRIMARY: %ld\n"
+            "DUPLICATE NON PRIMARY OPTICAL: %ld\n"
+            "DUPLICATE PRIMARY TOTAL: %ld\n"
+            "DUPLICATE TOTAL: %ld\n"
+            "ESTIMATED_LIBRARY_SIZE: %lu\n", stats->reading, stats->writing, stats->excluded, stats->examined, stats->pair, stats->single,
+            stats->duplicate, stats->single_dup, stats->optical, stats->single_optical, stats->np_duplicate, stats->np_opt_duplicate,
+            stats->single_dup + stats->duplicate, stats->single_dup + stats->duplicate + stats->np_duplicate, els);
+}
+
+
+/* Write one block of duplicate-marking statistics to fp as JSON object members.
+ *
+ * Only the "key": value members are written; the enclosing braces are left
+ * to the caller.
+ *
+ * fp         - stream the report is written to.
+ * offset     - string printed at the start of every line (indentation).
+ * group_name - optional read group name; when non-NULL a "READ GROUP" member
+ *              is written first.
+ *              NOTE(review): the name is written as-is between double quotes,
+ *              with no JSON escaping - names containing '"' or '\' would
+ *              produce invalid JSON.
+ * stats      - counters to report.
+ * end        - optional text written after the last member (which itself has
+ *              no trailing comma or newline); skipped when NULL.
+ *
+ * The estimated library size is computed here from the pair, duplicate and
+ * optical counters via estimate_library_size().
+ */
+static void write_json_stats(FILE *fp, const char *offset, const char *group_name, stats_block_t *stats, const char *end) {
+    unsigned long els;
+
+    els = estimate_library_size(stats->pair, stats->duplicate, stats->optical);
+
+    if (group_name) {
+        fprintf(fp, "%s\"READ GROUP\": \"%s\",\n", offset, group_name);
+    }
+
+    fprintf(fp, "%s\"READ\": %ld,\n", offset, stats->reading);
+    fprintf(fp, "%s\"WRITTEN\": %ld,\n", offset, stats->writing);
+    fprintf(fp, "%s\"EXCLUDED\": %ld,\n", offset, stats->excluded);
+    fprintf(fp, "%s\"EXAMINED\": %ld,\n", offset, stats->examined);
+    fprintf(fp, "%s\"PAIRED\": %ld,\n", offset, stats->pair);
+    fprintf(fp, "%s\"SINGLE\": %ld,\n", offset, stats->single);
+    fprintf(fp, "%s\"DUPLICATE PAIR\": %ld,\n", offset, stats->duplicate);
+    fprintf(fp, "%s\"DUPLICATE SINGLE\": %ld,\n", offset, stats->single_dup);
+    fprintf(fp, "%s\"DUPLICATE PAIR OPTICAL\": %ld,\n", offset, stats->optical);
+    fprintf(fp, "%s\"DUPLICATE SINGLE OPTICAL\": %ld,\n", offset, stats->single_optical);
+    fprintf(fp, "%s\"DUPLICATE NON PRIMARY\": %ld,\n", offset, stats->np_duplicate);
+    fprintf(fp, "%s\"DUPLICATE NON PRIMARY OPTICAL\": %ld,\n", offset, stats->np_opt_duplicate);
+    fprintf(fp, "%s\"DUPLICATE PRIMARY TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate);
+    fprintf(fp, "%s\"DUPLICATE TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate + stats->np_duplicate);
+    // els is unsigned long, so it is printed with %lu rather than %ld.
+    fprintf(fp, "%s\"ESTIMATED_LIBRARY_SIZE\": %lu", offset, els);
+
+    if (end) {
+        fprintf(fp, "%s", end);
+    }
+}
+
+
/* Compare the reads near each other (coordinate sorted) and try to spot the duplicates.
Generally the highest quality scoring is chosen as the original and all others the duplicates.
The score is based on the sum of the quality values (<= 15) of the read and its mate (if any).
klist_t(read_queue) *read_buffer = kl_init(read_queue);
kliter_t(read_queue) *rq;
khash_t(duplicates) *dup_hash = kh_init(duplicates);
+ khash_t(read_groups) *rg_hash = kh_init(read_groups);
int32_t prev_tid;
hts_pos_t prev_coord;
read_queue_t *in_read;
int ret;
- long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical;
- long np_duplicate, np_opt_duplicate;
+ stats_block_t *stats, *stat_array = NULL;
+ int num_groups = 0;
long opt_warnings = 0, bc_warnings = 0;
tmp_file_t temp;
char *idx_fn = NULL;
int exclude = 0;
check_list_t dup_list = {NULL, 0, 0};
- if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
- fprintf(stderr, "[markdup] out of memory\n");
+ if (!pair_hash || !single_hash || !read_buffer || !dup_hash || !rg_hash) {
+ print_error("markdup", "error, unable to allocate memory to initialise structures.\n");
goto fail;
}
if ((header = sam_hdr_read(param->in)) == NULL) {
- fprintf(stderr, "[markdup] error reading header\n");
+ print_error("markdup", "error reading header\n");
goto fail;
}
// only really works on coordinate sorted files.
kstring_t str = KS_INITIALIZE;
if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) {
- fprintf(stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n");
+ print_error("markdup", "error, queryname sorted, must be sorted by coordinate.\n");
ks_free(&str);
goto fail;
}
param->arg_list ? "CL" : NULL,
param->arg_list ? param->arg_list : NULL,
NULL) != 0) {
- fprintf(stderr, "[markdup] warning: unable to add @PG line to header.\n");
+ print_error("markdup", "warning, unable to add @PG line to header.\n");
}
if (sam_hdr_write(param->out, header) < 0) {
- fprintf(stderr, "[markdup] error writing header.\n");
+ print_error("markdup", "error writing header.\n");
goto fail;
}
if (param->write_index) {
goto fail;
}
+ if (param->read_groups) {
+ num_groups = sam_hdr_count_lines(header, "RG");
+ int g_ret = 0;
+
+ if (num_groups > 0) {
+ int i;
+
+ for (i = 0; i < num_groups; i++) {
+ const char *rg_key;
+ khiter_t rg;
+
+ rg_key = sam_hdr_line_name(header, "RG", i);
+
+ if (rg_key) {
+ rg = kh_get(read_groups, rg_hash, rg_key);
+
+ if (rg == kh_end(rg_hash)) { // new entry
+ rg = kh_put(read_groups, rg_hash, rg_key, &g_ret);
+
+ if (g_ret > 0) {
+ kh_value(rg_hash, rg) = i + 1;
+ } else {
+ print_error("markdup", "error, unable to populate read group ids. "
+ "Read groups will not be used\n");
+ g_ret = -1;
+ break;
+ }
+ } else {
+ print_error("markdup", "error, duplicate read group ids %s."
+ "Read groups will not be used\n", rg_key);
+ g_ret = -1;
+ break;
+ }
+ } else {
+ print_error("markdup", "error, Unable to retrieve read group at position %d."
+ "Read groups will not be used\n", i);
+ g_ret = -1;
+ break;
+ }
+ }
+ } else {
+ print_error("markdup", "error, no read groups found.\n");
+ g_ret = -1;
+ }
+
+ if (g_ret < 0) {
+ print_error("markdup", "error, read groups will not be used.\n");
+ param->read_groups = 0;
+ num_groups = 0;
+ }
+ }
+
+ // stat_array[0] will be for ungrouped reads
+ stat_array = calloc(num_groups + 1, sizeof(stats_block_t));
+
+ if (stat_array == NULL) {
+ print_error("markdup", "error, unable to allocate memory for stats.\n");
+ goto fail;
+ }
+
// used for coordinate order checks
prev_tid = prev_coord = 0;
// get the buffer going
in_read = kl_pushp(read_queue, read_buffer);
if (!in_read) {
- fprintf(stderr, "[markdup] out of memory\n");
+ print_error("markdup", "error, unable to allocate memory to hold reads.\n");
goto fail;
}
// handling supplementary reads needs a temporary file
if (param->supp) {
if (tmp_file_open_write(&temp, param->prefix, 1)) {
- fprintf(stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix);
+ print_error("markdup", "error, unable to open tmp file %s.\n", param->prefix);
goto fail;
}
}
if ((in_read->b = bam_init1()) == NULL) {
- fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ print_error("markdup", "error, unable to allocate memory for alignment.\n");
goto fail;
}
dup_list.c = NULL;
if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
- fprintf(stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
+ print_error("markdup", "error, unable to allocate memory for dup_list.\n");
goto fail;
}
}
- reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
- np_duplicate = np_opt_duplicate = 0;
-
while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
- int dup_checked = 0;
// do some basic coordinate order checks
if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
if (in_read->b->core.tid < prev_tid ||
((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) {
- fprintf(stderr, "[markdup] error: not in coordinate sorted order.\n");
+ print_error("markdup", "error, not in coordinate sorted order.\n");
goto fail;
}
}
in_read->pair_key.single = 1;
in_read->single_key.single = 0;
in_read->duplicate = NULL;
+ in_read->original = NULL;
in_read->dup_checked = 0;
+ in_read->read_group = 0;
- reading++;
+ if (param->read_groups) {
+ uint8_t *data;
+ char *rg;
+
+ if ((data = bam_aux_get(in_read->b, "RG"))) {
+ if ((rg = bam_aux2Z(data))) {
+ khiter_t r;
+
+ r = kh_get(read_groups, rg_hash, rg);
+
+ if (r != kh_end(rg_hash)) {
+ in_read->read_group = kh_value(rg_hash, r);
+ }
+ }
+ }
+ }
+
+ stats = stat_array + in_read->read_group;
+
+ stats->reading++;
if (param->clear && (in_read->b->core.flag & BAM_FDUP)) {
uint8_t *data;
// read must not be secondary, supplementary, unmapped or (possibly) failed QC
if (!(in_read->b->core.flag & exclude)) {
- examined++;
+ stats->examined++;
// look at the pairs first
key_data_t single_key;
in_hash_t *bp;
- if (make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) {
- fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
+ if (make_pair_key(param, &pair_key, in_read->b, in_read->read_group, &bc_warnings)) {
+ print_error("markdup", "error, unable to assign pair hash key.\n");
goto fail;
}
- make_single_key(param, &single_key, in_read->b, &bc_warnings);
+ make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings);
- pair++;
+ stats->pair++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
// put in singles hash for checking against non paired reads
// scores more than one read of the pair
bam1_t *dup = bp->p->b;
- if (param->check_chain)
+ if (param->check_chain) {
in_read->duplicate = bp->p;
+ bp->p->original = in_read;
+ }
bp->p = in_read;
- if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
- single_dup++;
+ stats->single_dup++;
}
} else {
- fprintf(stderr, "[markdup] error: single hashing failure.\n");
+ print_error("markdup", "error, single hashing failure for paired read.\n");
goto fail;
}
}
} else {
if ((mate_tmp = get_mate_score(bp->p->b)) == -1) {
- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
goto fail;
} else {
old_score = calc_score(bp->p->b) + mate_tmp;
}
if ((mate_tmp = get_mate_score(in_read->b)) == -1) {
- fprintf(stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
goto fail;
} else {
new_score = calc_score(in_read->b) + mate_tmp;
} else {
in_read->duplicate = bp->p;
}
+
+ bp->p->original = in_read;
}
bp->p = in_read;
}
bp->p->duplicate = in_read;
+ in_read->original = bp->p;
}
dup = in_read->b;
}
- if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings))
goto fail;
- duplicate++;
+ stats->duplicate++;
} else {
- fprintf(stderr, "[markdup] error: pair hashing failure.\n");
+ print_error("markdup", "error, pair hashing failure.\n");
goto fail;
}
} else { // do the single (or effectively single) reads
key_data_t single_key;
in_hash_t *bp;
- make_single_key(param, &single_key, in_read->b, &bc_warnings);
+ make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings);
- single++;
+ stats->single++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
k = kh_put(reads, single_hash, single_key, &ret);
}
bp->p->duplicate = in_read;
+ in_read->original = bp->p;
}
- if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
} else {
if (new_score > old_score) { // swap reads
dup = bp->p->b;
- if (param->check_chain)
+ if (param->check_chain) {
in_read->duplicate = bp->p;
+ bp->p->original = in_read;
+ }
bp->p = in_read;
} else {
}
bp->p->duplicate = in_read;
+ in_read->original = bp->p;
}
dup = in_read->b;
}
- if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
}
- single_dup++;
+ stats->single_dup++;
} else {
- fprintf(stderr, "[markdup] error: single hashing failure.\n");
+ print_error("markdup", "error, single hashing failure for single read.\n");
goto fail;
}
}
} else {
- excluded++;
+ stats->excluded++;
}
// loop through the stored reads and write out those we
break;
}
- if (!dup_checked && param->check_chain) {
- // check for multiple optical duplicates of the same original read
-
- if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
- fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
+ if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) {
+ if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) {
+ print_error("markdup", "error, duplicate checking failed.\n");
goto fail;
}
-
- dup_checked = 1;
- }
-
-
- if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
- break;
}
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
- fprintf(stderr, "[markdup] error: writing temp output failed.\n");
+ print_error("markdup", "error, writing temp output failed.\n");
goto fail;
}
} else {
if (sam_write1(param->out, header, in_read->b) < 0) {
- fprintf(stderr, "[markdup] error: writing output failed.\n");
+ print_error("markdup", "error, writing output failed.\n");
goto fail;
}
}
- writing++;
+ stat_array[in_read->read_group].writing++;
}
// remove from hash
// set the next one up for reading
in_read = kl_pushp(read_queue, read_buffer);
if (!in_read) {
- fprintf(stderr, "[markdup] out of memory\n");
+ print_error("markdup", "error, unable to allocate memory for read in queue.\n");
goto fail;
}
if ((in_read->b = bam_init1()) == NULL) {
- fprintf(stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ print_error("markdup", "error, unable to allocate memory for alignment.\n");
goto fail;
}
}
if (ret < -1) {
- fprintf(stderr, "[markdup] error: truncated input file.\n");
+ print_error("markdup", "error, truncated input file.\n");
goto fail;
}
- // one last check
- if (param->tag || param->opt_dist) {
- if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
- fprintf(stderr, "[markdup] error: duplicate checking failed.\n");
- goto fail;
- }
- }
-
// write out the end of the list
rq = kl_begin(read_buffer);
while (rq != kl_end(read_buffer)) {
in_read = &kl_val(rq);
if (bam_get_qname(in_read->b)) { // last entry will be blank
+ if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) {
+ if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) {
+ print_error("markdup", "error, duplicate checking failed.\n");
+ goto fail;
+ }
+ }
+
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
- fprintf(stderr, "[markdup] error: writing temp output failed.\n");
+ print_error("markdup", "error, writing temp output failed on final write.\n");
goto fail;
}
} else {
if (sam_write1(param->out, header, in_read->b) < 0) {
- fprintf(stderr, "[markdup] error: writing output failed.\n");
+ print_error("markdup", "error, writing output failed on final write.\n");
goto fail;
}
}
- writing++;
+ stat_array[in_read->read_group].writing++;
}
}
bam1_t *b;
if (tmp_file_end_write(&temp)) {
- fprintf(stderr, "[markdup] error: unable to end tmp writing.\n");
+ print_error("markdup", "error, unable to end tmp writing.\n");
goto fail;
}
if (k != kh_end(dup_hash)) {
b->core.flag |= BAM_FDUP;
- np_duplicate++;
+ stat_array[kh_val(dup_hash, k).read_group].np_duplicate++;
if (param->tag && kh_val(dup_hash, k).name) {
if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
- fprintf(stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
+ print_error("markdup", "error, unable to append supplementary 'do' tag.\n");
goto fail;
}
}
if (param->opt_dist) {
if (kh_val(dup_hash, k).type) {
bam_aux_update_str(b, "dt", 3, "SQ");
- np_opt_duplicate++;
+ stat_array[kh_val(dup_hash, k).read_group].np_opt_duplicate++;
} else {
bam_aux_update_str(b, "dt", 3, "LB");
}
if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
if (sam_write1(param->out, header, b) < 0) {
- fprintf(stderr, "[markdup] error: writing final output failed.\n");
+ print_error("markdup", "error, writing final output failed.\n");
goto fail;
}
}
}
if (ret == -1) {
- fprintf(stderr, "[markdup] error: failed to read tmp file.\n");
+ print_error("markdup", "error, failed to read tmp file.\n");
goto fail;
}
}
if (opt_warnings) {
- fprintf(stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n",
+ print_error("markdup", "warning, number of failed attempts to get coordinates from read names = %ld\n",
opt_warnings);
}
if (bc_warnings) {
- fprintf(stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings);
+ print_error("markdup", "warning, number of failed attempts to get barcodes = %ld\n", bc_warnings);
}
if (param->do_stats) {
FILE *fp;
int file_open = 0;
- unsigned long els;
+ stats_block_t total;
+ int i;
if (param->stats_file) {
if (NULL == (fp = fopen(param->stats_file, "w"))) {
- fprintf(stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file);
+ print_error("markdup", "warning, cannot write stats to %s.\n", param->stats_file);
fp = stderr;
} else {
file_open = 1;
fp = stderr;
}
- els = estimate_library_size(pair, duplicate, optical);
-
- fprintf(fp,
- "COMMAND: %s\n"
- "READ: %ld\n"
- "WRITTEN: %ld\n"
- "EXCLUDED: %ld\n"
- "EXAMINED: %ld\n"
- "PAIRED: %ld\n"
- "SINGLE: %ld\n"
- "DUPLICATE PAIR: %ld\n"
- "DUPLICATE SINGLE: %ld\n"
- "DUPLICATE PAIR OPTICAL: %ld\n"
- "DUPLICATE SINGLE OPTICAL: %ld\n"
- "DUPLICATE NON PRIMARY: %ld\n"
- "DUPLICATE NON PRIMARY OPTICAL: %ld\n"
- "DUPLICATE PRIMARY TOTAL: %ld\n"
- "DUPLICATE TOTAL: %ld\n"
- "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single,
- duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate,
- single_dup + duplicate, single_dup + duplicate + np_duplicate, els);
+ total = stat_array[0];
+
+ if (param->read_groups) {
+ for (i = 1; i <= num_groups; i++) {
+ total.reading += stat_array[i].reading;
+ total.writing += stat_array[i].writing;
+ total.excluded += stat_array[i].excluded;
+ total.duplicate += stat_array[i].duplicate;
+ total.single += stat_array[i].single;
+ total.pair += stat_array[i].pair;
+ total.single_dup += stat_array[i].single_dup;
+ total.examined += stat_array[i].examined;
+ total.optical += stat_array[i].optical;
+ total.single_optical += stat_array[i].single_optical;
+ total.np_duplicate += stat_array[i].np_duplicate;
+ total.np_opt_duplicate += stat_array[i].np_opt_duplicate;
+ }
+ }
+
+ if (!param->json) {
+ write_stats(fp, "COMMAND: ", param->arg_list, &total);
+ fprintf(fp, "\n");
+
+ if (param->read_groups) {
+ if (stat_array[0].reading) {
+ write_stats(fp, "READ GROUP: ", "ungrouped", stat_array);
+ fprintf(fp, "\n");
+ }
+
+ for (i = 0; i < num_groups; i++) {
+ write_stats(fp, "READ GROUP: ", sam_hdr_line_name(header, "RG", i), stat_array + i + 1);
+ fprintf(fp, "\n");
+ }
+ }
+ } else {
+ char space4[] = " ";
+ char space8[] = " ";
+ char space12[] = " ";
+
+ fprintf(fp, "{\n");
+ fprintf(fp, "%s\"COMMAND\": \"%s\",\n", space4, param->arg_list);
+ write_json_stats(fp, space4, NULL, &total, param->read_groups ? ",\n" : "\n");
+
+ if (param->read_groups) {
+ fprintf(fp, "%s\"READ GROUPS\": [\n", space4);
+
+ if (stat_array[0].reading) {
+ fprintf(fp, "%s{\n", space8);
+ write_json_stats(fp, space12, "ungrouped", stat_array, "\n");
+ fprintf(fp, "%s},\n", space8);
+ }
+
+ for (i = 0; i < num_groups; i++) {
+ fprintf(fp, "%s{\n", space8);
+
+ write_json_stats(fp, space12, sam_hdr_line_name(header, "RG", i), stat_array + i + 1, "\n");
+
+ if (i < num_groups -1 ) {
+ fprintf(fp, "%s},\n", space8);
+ } else {
+ fprintf(fp, "%s}\n", space8);
+ }
+ }
+
+ fprintf(fp, "%s]\n", space4);
+ }
+
+ fprintf(fp, "}\n");
+ }
if (file_open) {
fclose(fp);
if (param->write_index) {
if (sam_idx_save(param->out) < 0) {
- print_error_errno("markdup", "writing index failed");
+ print_error_errno("markdup", "error, writing index failed");
goto fail;
}
}
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
kl_destroy(read_queue, read_buffer);
kh_destroy(duplicates, dup_hash);
+ kh_destroy(read_groups, rg_hash);
sam_hdr_destroy(header);
return 0;
}
}
kh_destroy(duplicates, dup_hash);
+ kh_destroy(read_groups, rg_hash);
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
sam_hdr_destroy(header);
fprintf(stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n");
fprintf(stderr, " -s Report stats.\n");
fprintf(stderr, " -f NAME Write stats to named file. Implies -s.\n");
+ fprintf(stderr, " --json Output stats in JSON. Also implies -s\n");
fprintf(stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n");
fprintf(stderr, " -d INT Optical distance (if set, marks with dt tag)\n");
fprintf(stderr, " -c Clear previous duplicate settings and tags.\n");
fprintf(stderr, " --barcode-tag STR Use barcode a tag that duplicates much match.\n");
fprintf(stderr, " --barcode-name Use the UMI/barcode in the read name (eigth colon delimited part).\n");
fprintf(stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n");
+ fprintf(stderr, " --use-read-groups Use the read group tags in duplicate matching.\n");
fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
int bam_markdup(int argc, char **argv) {
int c, ret, bc_name = 0;
- char wmode[4] = {'w', 'b', 0, 0};
+ char wmode[4] = {'w', 0, 0, 0};
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
kstring_t tmpprefix = {0, 0, NULL};
char *regex = NULL, *bc_regex = NULL;
char *regex_order = "txy";
md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL};
+ 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"barcode-tag", required_argument, NULL, 1006},
{"barcode-name", no_argument, NULL, 1007},
{"barcode-rgx", required_argument, NULL, 1008},
+ {"use-read-groups", no_argument, NULL, 1009},
+ {"json", no_argument, NULL, 1010},
{NULL, 0, NULL, 0}
};
} else if (strcmp(optarg, "s") == 0) {
param.mode = MD_MODE_SEQUENCE;
} else {
- fprintf(stderr, "[markdup] error: unknown mode '%s'.\n", optarg);
+ print_error("markdup", "error, unknown mode '%s'.\n", optarg);
return markdup_usage();
}
break;
- case 'u': wmode[2] = '0'; break;
+ case 'u': wmode[1] = '0'; break;
case 1001: param.include_fails = 1; break;
case 1002: param.no_pg = 1; break;
case 1003: param.check_chain = 0; break;
case 1006: param.barcode = optarg; break;
case 1007: bc_name = 1; break;
case 1008: bc_name = 1, bc_regex = optarg; break;
+ case 1009: param.read_groups = 1; break;
+ case 1010: param.json = 1; param.do_stats = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
return markdup_usage();
if (param.barcode && bc_name) {
- fprintf(stderr, "[markdup] Error: cannot specify --barcode-tag and "
+ print_error("markdup", "error, cannot specify --barcode-tag and "
"--barcode-name (or --barcode-rgx) at same time.\n");
return 1;
}
param.rgx_y = 2;
param.rgx_t = 0;
} else {
- fprintf(stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order);
+ print_error("markdup", "error, could not recognise regex coordinate order \"%s\".\n", regex_order);
return 1;
}
if ((param.rgx = malloc(sizeof(regex_t))) == NULL) {
- fprintf(stderr, "[markdup] error: could not allocate memory for regex.\n");
+ print_error("markdup", "error, could not allocate memory for regex.\n");
return 1;
}
char err_msg[256];
regerror(result, param.rgx, err_msg, 256);
- fprintf(stderr, "[markdup] error: regex error \"%s\"\n", err_msg);
+ print_error("markdup", "error, regex fail \"%s\"\n", err_msg);
free(param.rgx);
return 1;
}
/* From Illumina UMI documentation: "The UMI sequence is located in the
eighth colon-delimited field of the read name (QNAME)". */
- char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)";
+ char *rgx = "[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:([!-?A-~]+)";
if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) {
- fprintf(stderr, "[markdup] error: could not allocate memory for barcode regex.\n");
+ print_error("markdup", "error, could not allocate memory for barcode regex.\n");
return 1;
}
char err_msg[256];
regerror(result, param.bc_rgx, err_msg, 256);
- fprintf(stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg);
+ print_error("markdup", "error, barcode regex fail \"%s\"\n", err_msg);
free(param.bc_rgx);
return 1;
}
param.in = sam_open_format(argv[optind], "r", &ga.in);
if (!param.in) {
- print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]);
+ print_error_errno("markdup", "error, failed to open \"%s\" for input", argv[optind]);
return 1;
}
- sam_open_mode(wmode + 1, argv[optind + 1], NULL);
+ strcat(wmode, "b"); // default if unknown suffix
+ sam_open_mode(wmode + strlen(wmode)-1, argv[optind + 1], NULL);
param.out = sam_open_format(argv[optind + 1], wmode, &ga.out);
if (!param.out) {
- print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]);
+ print_error_errno("markdup", "error, failed to open \"%s\" for output", argv[optind + 1]);
return 1;
}
if (ga.nthreads > 0) {
if (!(p.pool = hts_tpool_init(ga.nthreads))) {
- fprintf(stderr, "[markdup] error creating thread pool\n");
+ print_error("markdup", "error creating thread pool.\n");
return 1;
}
sam_close(param.in);
if (sam_close(param.out) < 0) {
- fprintf(stderr, "[markdup] error closing output file\n");
+ print_error("markdup", "error closing output file.\n");
ret = 1;
}
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2022 Genome Research Ltd.
+ Copyright (C) 2017-2023 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
int rgx_t;
char *barcode;
regex_t *bc_rgx;
+ int read_groups;
+ int json;
} md_param_t;
typedef struct {
int32_t this_ref;
int32_t other_ref;
int32_t barcode;
+ int32_t read_group;
int8_t single;
int8_t leftmost;
int8_t orientation;
key_data_t single_key;
bam1_t *b;
struct read_queue_s *duplicate;
+ struct read_queue_s *original;
hts_pos_t pos;
int dup_checked;
+ int read_group;
} read_queue_t;
typedef struct {
typedef struct {
char *name;
char type;
+ int read_group;
} dup_map_t;
typedef struct {
int end;
} check_t;
-
typedef struct {
check_t *c;
size_t size;
size_t length;
} check_list_t;
+typedef struct { /* Duplicate-marking counters; one block per read group, plus block 0 for ungrouped reads. */
+ long reading; /* reported as READ */
+ long writing; /* reported as WRITTEN */
+ long excluded; /* reported as EXCLUDED */
+ long duplicate; /* reported as DUPLICATE PAIR */
+ long single; /* reported as SINGLE */
+ long pair; /* reported as PAIRED */
+ long single_dup; /* reported as DUPLICATE SINGLE */
+ long examined; /* reported as EXAMINED */
+ long optical; /* reported as DUPLICATE PAIR OPTICAL */
+ long single_optical; /* reported as DUPLICATE SINGLE OPTICAL */
+ long np_duplicate; /* reported as DUPLICATE NON PRIMARY */
+ long np_opt_duplicate; /* reported as DUPLICATE NON PRIMARY OPTICAL */
+} stats_block_t;
+
static khint32_t do_hash(unsigned char *key, khint32_t len);
static khint_t hash_key(key_data_t key) {
khint_t hash;
if (key.single) {
- unsigned char sig[17];
+ unsigned char sig[21];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.orientation, 1); i += 1;
memcpy(sig + i, &key.barcode, 4); i += 4;
+ memcpy(sig + i, &key.read_group, 4); i += 4;
hash = do_hash(sig, i);
} else {
- unsigned char sig[30];
+ unsigned char sig[34];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.leftmost, 1); i += 1;
memcpy(sig + i, &key.orientation, 1); i += 1;
memcpy(sig + i, &key.barcode, 4); i += 4;
+ memcpy(sig + i, &key.read_group, 4); i += 4;
hash = do_hash(sig, i);
}
match = 0;
else if (a.barcode != b.barcode)
match = 0;
+ else if (a.read_group != b.read_group)
+ match = 0;
if (!a.single) {
if (a.other_coord != b.other_coord)
KHASH_INIT(reads, key_data_t, in_hash_t, 1, hash_key, key_equal) // read map hash
KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id
+KHASH_MAP_INIT_STR(read_groups, int) // read group lookup
/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
if ((data = bam_aux_get(b, "ms"))) {
score = bam_aux2i(data);
} else {
- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
return -1;
}
read is leftmost of the pair. */
-static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
+static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) {
hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
int32_t this_ref, other_ref, barcode = 0;
int8_t orientation, left_read;
if ((data = bam_aux_get(bam, "MC"))) {
if (!(cig = bam_aux2Z(data))) {
- fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
+ print_error("markdup", "error, MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
return 1;
}
other_end = unclipped_other_end(bam->core.mpos, cig);
other_coord = unclipped_other_start(bam->core.mpos, cig);
} else {
- fprintf(samtools_stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no MC tag. Please run samtools fixmate on file first.\n");
return 1;
}
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ print_error("markdup", "warning, %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
}
} else {
barcode = do_hash((unsigned char *)bar, strlen(bar));
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname);
}
}
} else {
char warn_msg[256];
regerror(result, param->bc_rgx, warn_msg, 256);
- fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
}
}
}
if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
- fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ print_error("markdup", "warning, %ld barcode read warnings. New warnings will not be reported.\n",
*warnings);
}
key->leftmost = left_read;
key->orientation = orientation;
key->barcode = barcode;
+ key->read_group = rg_num;
return 0;
}
Uses unclipped start (or end depending on orientation), reference id,
and orientation. */
-static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
+static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, int rg_num, long *warnings) {
hts_pos_t this_coord;
int32_t this_ref, barcode = 0;
int8_t orientation;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ print_error("markdup", "warning, %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
}
} else {
barcode = do_hash((unsigned char *)bar, strlen(bar));
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ print_error("markdup", "warning, barcode regex unable to match substring on %s.\n", qname);
}
}
} else {
char warn_msg[256];
regerror(result, param->bc_rgx, warn_msg, 256);
- fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ print_error("markdup", "warning, barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
}
}
}
if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
- fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ print_error("markdup", "warning, %ld barcode read warnings. New warnings will not be reported.\n",
*warnings);
}
+
key->single = 1;
key->this_ref = this_ref;
key->this_coord = this_coord;
key->orientation = orientation;
key->barcode = barcode;
+ key->read_group = rg_num;
}
/* Add the duplicate name to a hash if it does not exist. */
-static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type) {
+static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe, char *orig_name, char type, int group) {
khiter_t d;
int ret;
kh_value(d_hash, d).name = strdup(orig_name);
if (kh_value(d_hash, d).name == NULL) {
- fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for duplicate original name.\n");
+ print_error("markdup", "error, unable to allocate memory for duplicate original name.\n");
return 1;
}
} else {
}
kh_value(d_hash, d).type = type;
+ kh_value(d_hash, d).read_group = group;
} else {
- fprintf(samtools_stderr, "[markdup] error: unable to store supplementary duplicates.\n");
+ print_error("markdup", "error, unable to store supplementary duplicates.\n");
free(name);
return 1;
}
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname);
+ print_error("markdup", "warning, cannot decipher read name %s for optical duplicate marking.\n", qname);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname);
+ print_error("markdup", "warning, cannot decipher x coordinate in %s .\n", qname);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", qname);
+ print_error("markdup", "warning, cannot decipher y coordinate in %s .\n", qname);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen);
+ print_error("markdup", "warning, x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord);
+ print_error("markdup", "warning, cannot decipher x coordinate in %s (%s).\n", qname, coord);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen);
+ print_error("markdup", "warning, y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen);
}
return 1;
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord);
+ print_error("markdup", "warning, cannot decipher y coordinate in %s (%s).\n", qname, coord);
}
return 1;
/* Mark the read as a duplicate and update the duplicate hash (if needed) */
static int mark_duplicates(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *ori, bam1_t *dup,
- long *optical, long *warn) {
+ int read_group, long *optical, long *warn) {
char dup_type = 0;
long incoming_warnings = *warn;
if (param->tag) {
if (bam_aux_update_str(dup, "do", strlen(bam_get_qname(ori)) + 1, bam_get_qname(ori))) {
- fprintf(samtools_stderr, "[markdup] error: unable to append 'do' tag.\n");
+ print_error("markdup", "error, unable to append 'do' tag.\n");
return -1;
}
}
}
if ((*warn == BMD_WARNING_MAX) && (incoming_warnings != *warn)) {
- fprintf(samtools_stderr, "[markdup] warning: %ld decipher read name warnings. New warnings will not be reported.\n",
+ print_error("markdup", "warning, %ld decipher read name warnings. New warnings will not be reported.\n",
*warn);
}
original = bam_get_qname(ori);
}
- if (add_duplicate(dup_hash, dup, original, dup_type))
+ if (add_duplicate(dup_hash, dup, original, dup_type, read_group))
return -1;
}
}
/* If the duplicate type has changed to optical then retag and duplicate hash. */
-static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, long *optical_single, long *optical_pair) {
+static inline int optical_retag(md_param_t *param, khash_t(duplicates) *dup_hash, bam1_t *b, int paired, stats_block_t *stats) {
int ret = 0;
if (bam_aux_update_str(b, "dt", 3, "SQ")) {
- fprintf(samtools_stderr, "[markdup] error: unable to update 'dt' tag.\n");
+ print_error("markdup", "error, unable to update 'dt' tag.\n");
ret = -1;
}
if (paired) {
- (*optical_pair)++;
+ stats->optical++;
} else {
- (*optical_single)++;
+ stats->single_optical++;
}
if (param->supp) {
if (d == kh_end(dup_hash)) {
// error, name should already be in dup hash
- fprintf(samtools_stderr, "[markdup] error: duplicate name %s not found in hash.\n",
+ print_error("markdup", "error, duplicate name %s not found in hash.\n",
bam_get_qname(b));
ret = -1;
} else {
Returns 0 on success, >0 on coordinate reading error (program can continue) or
<0 on an error (program should not continue. */
static int check_chain_against_original(md_param_t *param, khash_t(duplicates) *dup_hash, read_queue_t *ori,
- check_list_t *list, long *warn, long *optical_single, long *optical_pair) {
+ check_list_t *list, long *warn, stats_block_t *stats) {
int ret = 0, coord_fail = 0;
char *ori_name = bam_get_qname(ori->b);
list->size *= 2;
if (!(tmp = realloc(list->c, list->size * sizeof(check_t)))) {
- fprintf(samtools_stderr, "[markdup] error: Unable to expand opt check list.\n");
+ print_error("markdup", "error, Unable to expand optical check list.\n");
return -1;
}
if (old_name) {
if (strcmp(old_name, ori_name) != 0) {
if (bam_aux_update_str(current->b, "do", strlen(ori_name) + 1, (const char *)ori_name)) {
- fprintf(samtools_stderr, "[markdup] error: unable to update 'do' tag.\n");
+ print_error("markdup", "error, unable to update 'do' tag.\n");
ret = -1;
break;
}
}
} else {
- fprintf(samtools_stderr, "[markdup] error: 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b));
+ print_error("markdup", "error, 'do' tag has wrong type for read %s.\n", bam_get_qname(current->b));
ret = -1;
break;
}
is_opt = optical_duplicate_partial(param, ori_name, t_beg, t_end, x, y, current->b, c, param->opt_dist, warn);
if (!c->opt && is_opt) {
- if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+ if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
ret = -1;
break;
}
if (current_paired) {
if ((c->mate_score = get_mate_score(current->b)) == -1) {
- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
ret = -1;
break;
}
if (!ret && coord_fail)
ret = coord_fail;
+ ori->dup_checked = 1;
+
return ret;
}
/* Check all the duplicates against each other to see if they are optical duplicates. */
static int check_duplicate_chain(md_param_t *param, khash_t(duplicates) *dup_hash, check_list_t *list,
- long *warn, long *optical_single, long *optical_pair) {
+ long *warn, stats_block_t *stats) {
int ret = 0;
size_t curr = 0;
if (chk_dup) {
// the duplicate is the optical duplicate
if (!chk->opt) { // only change if not already an optical duplicate
- if (optical_retag(param, dup_hash, chk->b, chk_paired, optical_single, optical_pair)) {
+ if (optical_retag(param, dup_hash, chk->b, chk_paired, stats)) {
ret = -1;
goto fail;
}
}
} else {
if (!current->opt) {
- if (optical_retag(param, dup_hash, current->b, current_paired, optical_single, optical_pair)) {
+ if (optical_retag(param, dup_hash, current->b, current_paired, stats)) {
ret = -1;
goto fail;
}
/* Where there is more than one duplicate go down the list and check for optical duplicates and change
do tags (where used) to point to original (non-duplicate) read. */
-static int find_duplicate_chains(md_param_t *param, klist_t(read_queue) *read_buffer, khash_t(duplicates) *dup_hash, check_list_t *dup_list,
- const hts_pos_t prev_coord, const int32_t prev_tid, long *warn, long *optical_single,
- long *optical_pair, const int check_range) {
+static int find_duplicate_chains(md_param_t *param, read_queue_t *in_read , khash_t(duplicates) *dup_hash, check_list_t *dup_list,
+ long *warn, stats_block_t *stats) {
int ret = 0;
- kliter_t(read_queue) *rq;
-
- rq = kl_begin(read_buffer);
- while (rq != kl_end(read_buffer)) {
- read_queue_t *in_read = &kl_val(rq);
+ while (in_read->original) in_read = in_read->original;
- if (check_range) {
- /* Just check against the moving window of reads based on coordinates and max read length. */
- if (in_read->pos + param->max_length > prev_coord && in_read->b->core.tid == prev_tid && (prev_tid != -1 || prev_coord != -1)) {
- break;
- }
- } else {
- // this is the last set of results and the end entry will be blank
- if (!bam_get_qname(in_read->b)) {
- break;
- }
+ // check against the original for tagging and optical duplication
+ if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, stats + in_read->read_group))) {
+ if (ret < 0) { // real error
+ ret = -1;
+ } else { // coordinate decoding error
+ ret = 0;
}
-
- if (!(in_read->b->core.flag & BAM_FDUP) && in_read->duplicate) { // is the head of a duplicate chain
-
- // check against the original for tagging and optical duplication
- if ((ret = check_chain_against_original(param, dup_hash, in_read, dup_list, warn, optical_single, optical_pair))) {
- if (ret < 0) { // real error
- ret = -1;
- break;
- } else { // coordinate decoding error
- ret = 0;
- in_read->duplicate = NULL;
- continue;
- }
- }
-
- // check the rest of the duplicates against each other for optical duplication
- if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, optical_single, optical_pair)) {
- ret = -1;
- break;
- }
-
- in_read->duplicate = NULL;
+ } else {
+ // check the rest of the duplicates against each other for optical duplication
+ if (param->opt_dist && check_duplicate_chain(param, dup_hash, dup_list, warn, stats + in_read->read_group)) {
+ ret = -1;
}
-
- rq = kl_next(rq);
}
return ret;
int i;
if (coverage_equation(m * (double)unique_pairs, (double)unique_pairs, (double)non_optical_pairs) < 0) {
- fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size.\n");
+ print_error("markdup", "warning, unable to calculate estimated library size.\n");
return estimated_size;
}
estimated_size = (unsigned long)(unique_pairs * (m + M) / 2);
} else {
- fprintf(samtools_stderr, "[markdup] warning: unable to calculate estimated library size."
+ print_error("markdup", "warning, unable to calculate estimated library size."
" Read pairs %ld should be greater than duplicate pairs %ld,"
" which should both be non zero.\n",
non_optical_pairs, duplicate_pairs);
}
+/* Write the plain-text duplicate statistics held in one stats block to fp.
+
+   fp        - stream the report is written to.
+   title     - optional heading; when non-NULL a line made of title followed
+               by title_con is written before the counters.
+   title_con - text appended to title on the heading line (only used when
+               title is non-NULL).
+   stats     - counters to report.
+
+   The estimated library size is computed from the paired, duplicate pair
+   and optical duplicate pair counts. */
+static void write_stats(FILE *fp, const char *title, const char *title_con, stats_block_t *stats) {
+    unsigned long els;
+
+    els = estimate_library_size(stats->pair, stats->duplicate, stats->optical);
+
+    if (title) {
+        fprintf(fp, "%s%s\n", title, title_con);
+    }
+
+    /* els is unsigned long, so it needs %lu; the counters are plain long. */
+    fprintf(fp,
+            "READ: %ld\n"
+            "WRITTEN: %ld\n"
+            "EXCLUDED: %ld\n"
+            "EXAMINED: %ld\n"
+            "PAIRED: %ld\n"
+            "SINGLE: %ld\n"
+            "DUPLICATE PAIR: %ld\n"
+            "DUPLICATE SINGLE: %ld\n"
+            "DUPLICATE PAIR OPTICAL: %ld\n"
+            "DUPLICATE SINGLE OPTICAL: %ld\n"
+            "DUPLICATE NON PRIMARY: %ld\n"
+            "DUPLICATE NON PRIMARY OPTICAL: %ld\n"
+            "DUPLICATE PRIMARY TOTAL: %ld\n"
+            "DUPLICATE TOTAL: %ld\n"
+            "ESTIMATED_LIBRARY_SIZE: %lu\n", stats->reading, stats->writing, stats->excluded, stats->examined, stats->pair, stats->single,
+            stats->duplicate, stats->single_dup, stats->optical, stats->single_optical, stats->np_duplicate, stats->np_opt_duplicate,
+            stats->single_dup + stats->duplicate, stats->single_dup + stats->duplicate + stats->np_duplicate, els);
+}
+
+
+/* Write s to fp as a quoted JSON string, escaping the characters that JSON
+   does not allow unescaped inside a string (double quote, backslash and
+   control characters below 0x20). */
+static void write_json_escaped(FILE *fp, const char *s) {
+    fputc('"', fp);
+
+    for (; *s; s++) {
+        unsigned char c = (unsigned char) *s;
+
+        if (c == '"' || c == '\\') {
+            fputc('\\', fp);
+            fputc(c, fp);
+        } else if (c < 0x20) {
+            fprintf(fp, "\\u%04x", (unsigned int) c);
+        } else {
+            fputc(c, fp);
+        }
+    }
+
+    fputc('"', fp);
+}
+
+
+/* Write the duplicate statistics held in one stats block to fp as the
+   members of a JSON object (the enclosing braces are left to the caller).
+
+   fp         - stream the report is written to.
+   offset     - text written at the start of every line (the indentation).
+   group_name - optional read group name; when non-NULL a "READ GROUP"
+                member is written first, with the name JSON-escaped.
+   stats      - counters to report.
+   end        - optional text written after the last member, which itself
+                has no trailing comma or newline.
+
+   The estimated library size is computed from the paired, duplicate pair
+   and optical duplicate pair counts. */
+static void write_json_stats(FILE *fp, const char *offset, const char *group_name, stats_block_t *stats, const char *end) {
+    unsigned long els;
+
+    els = estimate_library_size(stats->pair, stats->duplicate, stats->optical);
+
+    if (group_name) {
+        /* Read group names come from the input header, so escape them to
+           keep the output valid JSON. */
+        fprintf(fp, "%s\"READ GROUP\": ", offset);
+        write_json_escaped(fp, group_name);
+        fprintf(fp, ",\n");
+    }
+
+    fprintf(fp, "%s\"READ\": %ld,\n", offset, stats->reading);
+    fprintf(fp, "%s\"WRITTEN\": %ld,\n", offset, stats->writing);
+    fprintf(fp, "%s\"EXCLUDED\": %ld,\n", offset, stats->excluded);
+    fprintf(fp, "%s\"EXAMINED\": %ld,\n", offset, stats->examined);
+    fprintf(fp, "%s\"PAIRED\": %ld,\n", offset, stats->pair);
+    fprintf(fp, "%s\"SINGLE\": %ld,\n", offset, stats->single);
+    fprintf(fp, "%s\"DUPLICATE PAIR\": %ld,\n", offset, stats->duplicate);
+    fprintf(fp, "%s\"DUPLICATE SINGLE\": %ld,\n", offset, stats->single_dup);
+    fprintf(fp, "%s\"DUPLICATE PAIR OPTICAL\": %ld,\n", offset, stats->optical);
+    fprintf(fp, "%s\"DUPLICATE SINGLE OPTICAL\": %ld,\n", offset, stats->single_optical);
+    fprintf(fp, "%s\"DUPLICATE NON PRIMARY\": %ld,\n", offset, stats->np_duplicate);
+    fprintf(fp, "%s\"DUPLICATE NON PRIMARY OPTICAL\": %ld,\n", offset, stats->np_opt_duplicate);
+    fprintf(fp, "%s\"DUPLICATE PRIMARY TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate);
+    fprintf(fp, "%s\"DUPLICATE TOTAL\": %ld,\n", offset, stats->single_dup + stats->duplicate + stats->np_duplicate);
+    /* els is unsigned long, so it needs %lu rather than %ld. */
+    fprintf(fp, "%s\"ESTIMATED_LIBRARY_SIZE\": %lu", offset, els);
+
+    if (end) {
+        fprintf(fp, "%s", end);
+    }
+}
+
+
/* Compare the reads near each other (coordinate sorted) and try to spot the duplicates.
Generally the highest quality scoring is chosen as the original and all others the duplicates.
The score is based on the sum of the quality values (<= 15) of the read and its mate (if any).
klist_t(read_queue) *read_buffer = kl_init(read_queue);
kliter_t(read_queue) *rq;
khash_t(duplicates) *dup_hash = kh_init(duplicates);
+ khash_t(read_groups) *rg_hash = kh_init(read_groups);
int32_t prev_tid;
hts_pos_t prev_coord;
read_queue_t *in_read;
int ret;
- long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical;
- long np_duplicate, np_opt_duplicate;
+ stats_block_t *stats, *stat_array = NULL;
+ int num_groups = 0;
long opt_warnings = 0, bc_warnings = 0;
tmp_file_t temp;
char *idx_fn = NULL;
int exclude = 0;
check_list_t dup_list = {NULL, 0, 0};
- if (!pair_hash || !single_hash || !read_buffer || !dup_hash) {
- fprintf(samtools_stderr, "[markdup] out of memory\n");
+ if (!pair_hash || !single_hash || !read_buffer || !dup_hash || !rg_hash) {
+ print_error("markdup", "error, unable to allocate memory to initialise structures.\n");
goto fail;
}
if ((header = sam_hdr_read(param->in)) == NULL) {
- fprintf(samtools_stderr, "[markdup] error reading header\n");
+ print_error("markdup", "error reading header\n");
goto fail;
}
// only really works on coordinate sorted files.
kstring_t str = KS_INITIALIZE;
if (!sam_hdr_find_tag_hd(header, "SO", &str) && str.s && !strcmp(str.s, "queryname")) {
- fprintf(samtools_stderr, "[markdup] error: queryname sorted, must be sorted by coordinate.\n");
+ print_error("markdup", "error, queryname sorted, must be sorted by coordinate.\n");
ks_free(&str);
goto fail;
}
param->arg_list ? "CL" : NULL,
param->arg_list ? param->arg_list : NULL,
NULL) != 0) {
- fprintf(samtools_stderr, "[markdup] warning: unable to add @PG line to header.\n");
+ print_error("markdup", "warning, unable to add @PG line to header.\n");
}
if (sam_hdr_write(param->out, header) < 0) {
- fprintf(samtools_stderr, "[markdup] error writing header.\n");
+ print_error("markdup", "error writing header.\n");
goto fail;
}
if (param->write_index) {
goto fail;
}
+    /* Give every @RG line in the header a 1-based number, keyed by its name,
+       so reads can be matched to a read group by a hash lookup.  Number 0 is
+       left for reads with no recognised read group.  Any problem here turns
+       read group handling off instead of aborting the run. */
+    if (param->read_groups) {
+        num_groups = sam_hdr_count_lines(header, "RG");
+        int g_ret = 0;
+
+        if (num_groups > 0) {
+            int i;
+
+            for (i = 0; i < num_groups; i++) {
+                const char *rg_key;
+                khiter_t rg;
+
+                rg_key = sam_hdr_line_name(header, "RG", i);
+
+                if (rg_key) {
+                    rg = kh_get(read_groups, rg_hash, rg_key);
+
+                    if (rg == kh_end(rg_hash)) { // new entry
+                        rg = kh_put(read_groups, rg_hash, rg_key, &g_ret);
+
+                        if (g_ret > 0) {
+                            kh_value(rg_hash, rg) = i + 1;
+                        } else {
+                            print_error("markdup", "error, unable to populate read group ids. "
+                                        "Read groups will not be used\n");
+                            g_ret = -1;
+                            break;
+                        }
+                    } else {
+                        // the separating space keeps the two sentences apart in the message
+                        print_error("markdup", "error, duplicate read group ids %s. "
+                                    "Read groups will not be used\n", rg_key);
+                        g_ret = -1;
+                        break;
+                    }
+                } else {
+                    print_error("markdup", "error, Unable to retrieve read group at position %d. "
+                                "Read groups will not be used\n", i);
+                    g_ret = -1;
+                    break;
+                }
+            }
+        } else {
+            print_error("markdup", "error, no read groups found.\n");
+            g_ret = -1;
+        }
+
+        if (g_ret < 0) {
+            // fall back to treating all reads as one group
+            print_error("markdup", "error, read groups will not be used.\n");
+            param->read_groups = 0;
+            num_groups = 0;
+        }
+    }
+
+ // stat_array[0] will be for ungrouped reads
+ stat_array = calloc(num_groups + 1, sizeof(stats_block_t));
+
+ if (stat_array == NULL) {
+ print_error("markdup", "error, unable to allocate memory for stats.\n");
+ goto fail;
+ }
+
// used for coordinate order checks
prev_tid = prev_coord = 0;
// get the buffer going
in_read = kl_pushp(read_queue, read_buffer);
if (!in_read) {
- fprintf(samtools_stderr, "[markdup] out of memory\n");
+ print_error("markdup", "error, unable to allocate memory to hold reads.\n");
goto fail;
}
// handling supplementary reads needs a temporary file
if (param->supp) {
if (tmp_file_open_write(&temp, param->prefix, 1)) {
- fprintf(samtools_stderr, "[markdup] error: unable to open tmp file %s.\n", param->prefix);
+ print_error("markdup", "error, unable to open tmp file %s.\n", param->prefix);
goto fail;
}
}
if ((in_read->b = bam_init1()) == NULL) {
- fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ print_error("markdup", "error, unable to allocate memory for alignment.\n");
goto fail;
}
dup_list.c = NULL;
if ((dup_list.c = malloc(dup_list.size * sizeof(check_t))) == NULL) {
- fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for dup_list.\n");
+ print_error("markdup", "error, unable to allocate memory for dup_list.\n");
goto fail;
}
}
- reading = writing = excluded = single_dup = duplicate = examined = pair = single = optical = single_optical = 0;
- np_duplicate = np_opt_duplicate = 0;
-
while ((ret = sam_read1(param->in, header, in_read->b)) >= 0) {
- int dup_checked = 0;
// do some basic coordinate order checks
if (in_read->b->core.tid >= 0) { // -1 for unmapped reads
if (in_read->b->core.tid < prev_tid ||
((in_read->b->core.tid == prev_tid) && (in_read->b->core.pos < prev_coord))) {
- fprintf(samtools_stderr, "[markdup] error: not in coordinate sorted order.\n");
+ print_error("markdup", "error, not in coordinate sorted order.\n");
goto fail;
}
}
in_read->pair_key.single = 1;
in_read->single_key.single = 0;
in_read->duplicate = NULL;
+ in_read->original = NULL;
in_read->dup_checked = 0;
+ in_read->read_group = 0;
- reading++;
+ if (param->read_groups) {
+ uint8_t *data;
+ char *rg;
+
+ if ((data = bam_aux_get(in_read->b, "RG"))) {
+ if ((rg = bam_aux2Z(data))) {
+ khiter_t r;
+
+ r = kh_get(read_groups, rg_hash, rg);
+
+ if (r != kh_end(rg_hash)) {
+ in_read->read_group = kh_value(rg_hash, r);
+ }
+ }
+ }
+ }
+
+ stats = stat_array + in_read->read_group;
+
+ stats->reading++;
if (param->clear && (in_read->b->core.flag & BAM_FDUP)) {
uint8_t *data;
// read must not be secondary, supplementary, unmapped or (possibly) failed QC
if (!(in_read->b->core.flag & exclude)) {
- examined++;
+ stats->examined++;
// look at the pairs first
key_data_t single_key;
in_hash_t *bp;
- if (make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) {
- fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n");
+ if (make_pair_key(param, &pair_key, in_read->b, in_read->read_group, &bc_warnings)) {
+ print_error("markdup", "error, unable to assign pair hash key.\n");
goto fail;
}
- make_single_key(param, &single_key, in_read->b, &bc_warnings);
+ make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings);
- pair++;
+ stats->pair++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
// put in singles hash for checking against non paired reads
// scores more than one read of the pair
bam1_t *dup = bp->p->b;
- if (param->check_chain)
+ if (param->check_chain) {
in_read->duplicate = bp->p;
+ bp->p->original = in_read;
+ }
bp->p = in_read;
- if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
- single_dup++;
+ stats->single_dup++;
}
} else {
- fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n");
+ print_error("markdup", "error, single hashing failure for paired read.\n");
goto fail;
}
}
} else {
if ((mate_tmp = get_mate_score(bp->p->b)) == -1) {
- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
goto fail;
} else {
old_score = calc_score(bp->p->b) + mate_tmp;
}
if ((mate_tmp = get_mate_score(in_read->b)) == -1) {
- fprintf(samtools_stderr, "[markdup] error: no ms score tag. Please run samtools fixmate on file first.\n");
+ print_error("markdup", "error, no ms score tag. Please run samtools fixmate on file first.\n");
goto fail;
} else {
new_score = calc_score(in_read->b) + mate_tmp;
} else {
in_read->duplicate = bp->p;
}
+
+ bp->p->original = in_read;
}
bp->p = in_read;
}
bp->p->duplicate = in_read;
+ in_read->original = bp->p;
}
dup = in_read->b;
}
- if (mark_duplicates(param, dup_hash, bp->p->b, dup, &optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->optical, &opt_warnings))
goto fail;
- duplicate++;
+ stats->duplicate++;
} else {
- fprintf(samtools_stderr, "[markdup] error: pair hashing failure.\n");
+ print_error("markdup", "error, pair hashing failure.\n");
goto fail;
}
} else { // do the single (or effectively single) reads
key_data_t single_key;
in_hash_t *bp;
- make_single_key(param, &single_key, in_read->b, &bc_warnings);
+ make_single_key(param, &single_key, in_read->b, in_read->read_group, &bc_warnings);
- single++;
+ stats->single++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
k = kh_put(reads, single_hash, single_key, &ret);
}
bp->p->duplicate = in_read;
+ in_read->original = bp->p;
}
- if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, &single_optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, in_read->b, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
} else {
if (new_score > old_score) { // swap reads
dup = bp->p->b;
- if (param->check_chain)
+ if (param->check_chain) {
in_read->duplicate = bp->p;
+ bp->p->original = in_read;
+ }
bp->p = in_read;
} else {
}
bp->p->duplicate = in_read;
+ in_read->original = bp->p;
}
dup = in_read->b;
}
- if (mark_duplicates(param, dup_hash, bp->p->b, dup, &single_optical, &opt_warnings))
+ if (mark_duplicates(param, dup_hash, bp->p->b, dup, in_read->read_group, &stats->single_optical, &opt_warnings))
goto fail;
}
- single_dup++;
+ stats->single_dup++;
} else {
- fprintf(samtools_stderr, "[markdup] error: single hashing failure.\n");
+ print_error("markdup", "error, single hashing failure for single read.\n");
goto fail;
}
}
} else {
- excluded++;
+ stats->excluded++;
}
// loop through the stored reads and write out those we
break;
}
- if (!dup_checked && param->check_chain) {
- // check for multiple optical duplicates of the same original read
-
- if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 1)) {
- fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n");
+ if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) {
+ if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) {
+ print_error("markdup", "error, duplicate checking failed.\n");
goto fail;
}
-
- dup_checked = 1;
- }
-
-
- if (param->check_chain && (in_read->b->core.flag & BAM_FDUP) && !in_read->dup_checked && !(in_read->b->core.flag & exclude)) {
- break;
}
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
- fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n");
+ print_error("markdup", "error, writing temp output failed.\n");
goto fail;
}
} else {
if (sam_write1(param->out, header, in_read->b) < 0) {
- fprintf(samtools_stderr, "[markdup] error: writing output failed.\n");
+ print_error("markdup", "error, writing output failed.\n");
goto fail;
}
}
- writing++;
+ stat_array[in_read->read_group].writing++;
}
// remove from hash
// set the next one up for reading
in_read = kl_pushp(read_queue, read_buffer);
if (!in_read) {
- fprintf(samtools_stderr, "[markdup] out of memory\n");
+ print_error("markdup", "error, unable to allocate memory for read in queue.\n");
goto fail;
}
if ((in_read->b = bam_init1()) == NULL) {
- fprintf(samtools_stderr, "[markdup] error: unable to allocate memory for alignment.\n");
+ print_error("markdup", "error, unable to allocate memory for alignment.\n");
goto fail;
}
}
if (ret < -1) {
- fprintf(samtools_stderr, "[markdup] error: truncated input file.\n");
+ print_error("markdup", "error, truncated input file.\n");
goto fail;
}
- // one last check
- if (param->tag || param->opt_dist) {
- if (find_duplicate_chains(param, read_buffer, dup_hash, &dup_list, prev_coord, prev_tid, &opt_warnings, &single_optical, &optical, 0)) {
- fprintf(samtools_stderr, "[markdup] error: duplicate checking failed.\n");
- goto fail;
- }
- }
-
// write out the end of the list
rq = kl_begin(read_buffer);
while (rq != kl_end(read_buffer)) {
in_read = &kl_val(rq);
if (bam_get_qname(in_read->b)) { // last entry will be blank
+ if (param->check_chain && !in_read->dup_checked && (in_read->original || in_read->duplicate)) {
+ if (find_duplicate_chains(param, in_read, dup_hash, &dup_list, &opt_warnings, stat_array)) {
+ print_error("markdup", "error, duplicate checking failed.\n");
+ goto fail;
+ }
+ }
+
if (!param->remove_dups || !(in_read->b->core.flag & BAM_FDUP)) {
if (param->supp) {
if (tmp_file_write(&temp, in_read->b)) {
- fprintf(samtools_stderr, "[markdup] error: writing temp output failed.\n");
+ print_error("markdup", "error, writing temp output failed on final write.\n");
goto fail;
}
} else {
if (sam_write1(param->out, header, in_read->b) < 0) {
- fprintf(samtools_stderr, "[markdup] error: writing output failed.\n");
+ print_error("markdup", "error, writing output failed on final write.\n");
goto fail;
}
}
- writing++;
+ stat_array[in_read->read_group].writing++;
}
}
bam1_t *b;
if (tmp_file_end_write(&temp)) {
- fprintf(samtools_stderr, "[markdup] error: unable to end tmp writing.\n");
+ print_error("markdup", "error, unable to end tmp writing.\n");
goto fail;
}
if (k != kh_end(dup_hash)) {
b->core.flag |= BAM_FDUP;
- np_duplicate++;
+ stat_array[kh_val(dup_hash, k).read_group].np_duplicate++;
if (param->tag && kh_val(dup_hash, k).name) {
if (bam_aux_update_str(b, "do", strlen(kh_val(dup_hash, k).name) + 1, (char*)kh_val(dup_hash, k).name)) {
- fprintf(samtools_stderr, "[markdup] error: unable to append supplementary 'do' tag.\n");
+ print_error("markdup", "error, unable to append supplementary 'do' tag.\n");
goto fail;
}
}
if (param->opt_dist) {
if (kh_val(dup_hash, k).type) {
bam_aux_update_str(b, "dt", 3, "SQ");
- np_opt_duplicate++;
+ stat_array[kh_val(dup_hash, k).read_group].np_opt_duplicate++;
} else {
bam_aux_update_str(b, "dt", 3, "LB");
}
if (!param->remove_dups || !(b->core.flag & BAM_FDUP)) {
if (sam_write1(param->out, header, b) < 0) {
- fprintf(samtools_stderr, "[markdup] error: writing final output failed.\n");
+ print_error("markdup", "error, writing final output failed.\n");
goto fail;
}
}
}
if (ret == -1) {
- fprintf(samtools_stderr, "[markdup] error: failed to read tmp file.\n");
+ print_error("markdup", "error, failed to read tmp file.\n");
goto fail;
}
}
if (opt_warnings) {
- fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get coordinates from read names = %ld\n",
+ print_error("markdup", "warning, number of failed attempts to get coordinates from read names = %ld\n",
opt_warnings);
}
if (bc_warnings) {
- fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings);
+ print_error("markdup", "warning, number of failed attempts to get barcodes = %ld\n", bc_warnings);
}
if (param->do_stats) {
FILE *fp;
int file_open = 0;
- unsigned long els;
+ stats_block_t total;
+ int i;
if (param->stats_file) {
if (NULL == (fp = fopen(param->stats_file, "w"))) {
- fprintf(samtools_stderr, "[markdup] warning: cannot write stats to %s.\n", param->stats_file);
+ print_error("markdup", "warning, cannot write stats to %s.\n", param->stats_file);
fp = samtools_stderr;
} else {
file_open = 1;
fp = samtools_stderr;
}
- els = estimate_library_size(pair, duplicate, optical);
-
- fprintf(fp,
- "COMMAND: %s\n"
- "READ: %ld\n"
- "WRITTEN: %ld\n"
- "EXCLUDED: %ld\n"
- "EXAMINED: %ld\n"
- "PAIRED: %ld\n"
- "SINGLE: %ld\n"
- "DUPLICATE PAIR: %ld\n"
- "DUPLICATE SINGLE: %ld\n"
- "DUPLICATE PAIR OPTICAL: %ld\n"
- "DUPLICATE SINGLE OPTICAL: %ld\n"
- "DUPLICATE NON PRIMARY: %ld\n"
- "DUPLICATE NON PRIMARY OPTICAL: %ld\n"
- "DUPLICATE PRIMARY TOTAL: %ld\n"
- "DUPLICATE TOTAL: %ld\n"
- "ESTIMATED_LIBRARY_SIZE: %ld\n", param->arg_list, reading, writing, excluded, examined, pair, single,
- duplicate, single_dup, optical, single_optical, np_duplicate, np_opt_duplicate,
- single_dup + duplicate, single_dup + duplicate + np_duplicate, els);
+ total = stat_array[0];
+
+ if (param->read_groups) {
+ for (i = 1; i <= num_groups; i++) {
+ total.reading += stat_array[i].reading;
+ total.writing += stat_array[i].writing;
+ total.excluded += stat_array[i].excluded;
+ total.duplicate += stat_array[i].duplicate;
+ total.single += stat_array[i].single;
+ total.pair += stat_array[i].pair;
+ total.single_dup += stat_array[i].single_dup;
+ total.examined += stat_array[i].examined;
+ total.optical += stat_array[i].optical;
+ total.single_optical += stat_array[i].single_optical;
+ total.np_duplicate += stat_array[i].np_duplicate;
+ total.np_opt_duplicate += stat_array[i].np_opt_duplicate;
+ }
+ }
+
+ if (!param->json) {
+ write_stats(fp, "COMMAND: ", param->arg_list, &total);
+ fprintf(fp, "\n");
+
+ if (param->read_groups) {
+ if (stat_array[0].reading) {
+ write_stats(fp, "READ GROUP: ", "ungrouped", stat_array);
+ fprintf(fp, "\n");
+ }
+
+ for (i = 0; i < num_groups; i++) {
+ write_stats(fp, "READ GROUP: ", sam_hdr_line_name(header, "RG", i), stat_array + i + 1);
+ fprintf(fp, "\n");
+ }
+ }
+ } else {
+ char space4[] = " ";
+ char space8[] = " ";
+ char space12[] = " ";
+
+ fprintf(fp, "{\n");
+ fprintf(fp, "%s\"COMMAND\": \"%s\",\n", space4, param->arg_list);
+ write_json_stats(fp, space4, NULL, &total, param->read_groups ? ",\n" : "\n");
+
+ if (param->read_groups) {
+ fprintf(fp, "%s\"READ GROUPS\": [\n", space4);
+
+ if (stat_array[0].reading) {
+ fprintf(fp, "%s{\n", space8);
+ write_json_stats(fp, space12, "ungrouped", stat_array, "\n");
+ fprintf(fp, "%s},\n", space8);
+ }
+
+ for (i = 0; i < num_groups; i++) {
+ fprintf(fp, "%s{\n", space8);
+
+ write_json_stats(fp, space12, sam_hdr_line_name(header, "RG", i), stat_array + i + 1, "\n");
+
+ if (i < num_groups -1 ) {
+ fprintf(fp, "%s},\n", space8);
+ } else {
+ fprintf(fp, "%s}\n", space8);
+ }
+ }
+
+ fprintf(fp, "%s]\n", space4);
+ }
+
+ fprintf(fp, "}\n");
+ }
if (file_open) {
fclose(fp);
if (param->write_index) {
if (sam_idx_save(param->out) < 0) {
- print_error_errno("markdup", "writing index failed");
+ print_error_errno("markdup", "error, writing index failed");
goto fail;
}
}
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
kl_destroy(read_queue, read_buffer);
kh_destroy(duplicates, dup_hash);
+ kh_destroy(read_groups, rg_hash);
sam_hdr_destroy(header);
return 0;
}
}
kh_destroy(duplicates, dup_hash);
+ kh_destroy(read_groups, rg_hash);
if (param->check_chain && (param->tag || param->opt_dist))
free(dup_list.c);
+ free(stat_array);
kh_destroy(reads, pair_hash);
kh_destroy(reads, single_hash);
sam_hdr_destroy(header);
fprintf(samtools_stderr, " -S Mark supplementary alignments of duplicates as duplicates (slower).\n");
fprintf(samtools_stderr, " -s Report stats.\n");
fprintf(samtools_stderr, " -f NAME Write stats to named file. Implies -s.\n");
+ fprintf(samtools_stderr, " --json Output stats in JSON. Also implies -s\n");
fprintf(samtools_stderr, " -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp.\n");
fprintf(samtools_stderr, " -d INT Optical distance (if set, marks with dt tag)\n");
fprintf(samtools_stderr, " -c Clear previous duplicate settings and tags.\n");
fprintf(samtools_stderr, " --barcode-tag STR Use barcode a tag that duplicates much match.\n");
fprintf(samtools_stderr, " --barcode-name Use the UMI/barcode in the read name (eigth colon delimited part).\n");
fprintf(samtools_stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n");
+ fprintf(samtools_stderr, " --use-read-groups Use the read group tags in duplicate matching.\n");
fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
int bam_markdup(int argc, char **argv) {
int c, ret, bc_name = 0;
- char wmode[4] = {'w', 'b', 0, 0};
+ char wmode[4] = {'w', 0, 0, 0};
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
kstring_t tmpprefix = {0, 0, NULL};
char *regex = NULL, *bc_regex = NULL;
char *regex_order = "txy";
md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL};
+ 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL, 0, 0};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"barcode-tag", required_argument, NULL, 1006},
{"barcode-name", no_argument, NULL, 1007},
{"barcode-rgx", required_argument, NULL, 1008},
+ {"use-read-groups", no_argument, NULL, 1009},
+ {"json", no_argument, NULL, 1010},
{NULL, 0, NULL, 0}
};
} else if (strcmp(optarg, "s") == 0) {
param.mode = MD_MODE_SEQUENCE;
} else {
- fprintf(samtools_stderr, "[markdup] error: unknown mode '%s'.\n", optarg);
+ print_error("markdup", "error, unknown mode '%s'.\n", optarg);
return markdup_usage();
}
break;
- case 'u': wmode[2] = '0'; break;
+ case 'u': wmode[1] = '0'; break;
case 1001: param.include_fails = 1; break;
case 1002: param.no_pg = 1; break;
case 1003: param.check_chain = 0; break;
case 1006: param.barcode = optarg; break;
case 1007: bc_name = 1; break;
case 1008: bc_name = 1, bc_regex = optarg; break;
+ case 1009: param.read_groups = 1; break;
+ case 1010: param.json = 1; param.do_stats = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
return markdup_usage();
if (param.barcode && bc_name) {
- fprintf(samtools_stderr, "[markdup] Error: cannot specify --barcode-tag and "
+ print_error("markdup", "error, cannot specify --barcode-tag and "
"--barcode-name (or --barcode-rgx) at same time.\n");
return 1;
}
param.rgx_y = 2;
param.rgx_t = 0;
} else {
- fprintf(samtools_stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order);
+ print_error("markdup", "error, could not recognise regex coordinate order \"%s\".\n", regex_order);
return 1;
}
if ((param.rgx = malloc(sizeof(regex_t))) == NULL) {
- fprintf(samtools_stderr, "[markdup] error: could not allocate memory for regex.\n");
+ print_error("markdup", "error, could not allocate memory for regex.\n");
return 1;
}
char err_msg[256];
regerror(result, param.rgx, err_msg, 256);
- fprintf(samtools_stderr, "[markdup] error: regex error \"%s\"\n", err_msg);
+ print_error("markdup", "error, regex fail \"%s\"\n", err_msg);
free(param.rgx);
return 1;
}
/* From Illumina UMI documentation: "The UMI sequence is located in the
eighth colon-delimited field of the read name (QNAME)". */
- char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)";
+ char *rgx = "[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:[0-9A-Za-z]+:([!-?A-~]+)";
if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) {
- fprintf(samtools_stderr, "[markdup] error: could not allocate memory for barcode regex.\n");
+ print_error("markdup", "error, could not allocate memory for barcode regex.\n");
return 1;
}
char err_msg[256];
regerror(result, param.bc_rgx, err_msg, 256);
- fprintf(samtools_stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg);
+ print_error("markdup", "error, barcode regex fail \"%s\"\n", err_msg);
free(param.bc_rgx);
return 1;
}
param.in = sam_open_format(argv[optind], "r", &ga.in);
if (!param.in) {
- print_error_errno("markdup", "failed to open \"%s\" for input", argv[optind]);
+ print_error_errno("markdup", "error, failed to open \"%s\" for input", argv[optind]);
return 1;
}
- sam_open_mode(wmode + 1, argv[optind + 1], NULL);
+ strcat(wmode, "b"); // default if unknown suffix
+ sam_open_mode(wmode + strlen(wmode)-1, argv[optind + 1], NULL);
param.out = sam_open_format(argv[optind + 1], wmode, &ga.out);
if (!param.out) {
- print_error_errno("markdup", "failed to open \"%s\" for output", argv[optind + 1]);
+ print_error_errno("markdup", "error, failed to open \"%s\" for output", argv[optind + 1]);
return 1;
}
if (ga.nthreads > 0) {
if (!(p.pool = hts_tpool_init(ga.nthreads))) {
- fprintf(samtools_stderr, "[markdup] error creating thread pool\n");
+ print_error("markdup", "error creating thread pool.\n");
return 1;
}
sam_close(param.in);
if (sam_close(param.out) < 0) {
- fprintf(samtools_stderr, "[markdup] error closing output file\n");
+ print_error("markdup", "error closing output file.\n");
ret = 1;
}
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2017, 2019, 2022 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
return 0;
}
+// Remove the CIGAR array from a BAM record in place.
+// Everything stored after the CIGAR (from the sequence up to the end of the
+// record data) is slid down over it, then l_data and n_cigar are updated.
+static void clear_cigar(bam1_t *b) {
+    void *cigar_start = bam_get_cigar(b);
+    uint8_t *seq_start = bam_get_seq(b);
+    uint8_t *data_end = b->data + b->l_data;
+
+    memmove(cigar_start, seq_start, data_end - seq_start);
+
+    // Each CIGAR element occupies 4 bytes.
+    b->l_data -= 4*b->core.n_cigar;
+    b->core.n_cigar = 0;
+}
+
+// Trim a CIGAR field to end on reference position "end".  Query bases in
+// the removed part are merged into a single soft clip; trailing hard clips
+// are kept after it.  Removed ops that consume no query bases (D, N, P)
+// add nothing to the soft clip.
+//
+// If the op that crosses "end" also starts at or beyond "end", the record
+// is flagged unmapped (and no longer properly paired) and its CIGAR is left
+// untouched for the caller to clear.
+//
+// Returns 0 on success (including when no trimming was needed),
+//        -1 on memory allocation failure.
+static int bam_trim(bam1_t *b, hts_pos_t end) {
+    hts_pos_t pos = b->core.pos;
+    int n_cigar = b->core.n_cigar, i;
+    uint32_t new_cigar_a[1024];
+    uint32_t *new_cigar = new_cigar_a;
+    uint32_t *cigar = bam_get_cigar(b);
+
+    // Find the first reference-consuming op that runs past "end".
+    int op = 0, oplen = 0;
+    for (i = 0; i < n_cigar; i++) {
+        op = bam_cigar_op(cigar[i]);
+        oplen = bam_cigar_oplen(cigar[i]);
+        if (!(bam_cigar_type(op) & 2))
+            continue;
+        pos += oplen;
+        if (pos > end)
+            break;
+    }
+
+    if (i == n_cigar)
+        // looks fine already
+        return 0;
+
+    if (pos-oplen >= end) {
+        // The crossing op starts at or past "end"; this will trigger
+        // CIGAR *, MQUAL 0 in the caller.  Return before building a
+        // replacement tail: there is no soft clip element to fold the
+        // trailing ops into.
+        // NOTE(review): this also unmaps reads whose earlier ops end
+        // exactly on "end"; that matches the previous behaviour.
+        b->core.flag |= BAM_FUNMAP;
+        b->core.flag &= ~BAM_FPROPER_PAIR;
+        return 0;
+    }
+
+    // Tail layout: new_cigar[0] is reserved for the soft clip and hard
+    // clips follow from new_cigar[1], so at most n_cigar-i elements.
+    if (n_cigar-i >= 1024-1) {
+        new_cigar = malloc(4*(n_cigar-i+1));
+        if (!new_cigar)
+            return -1;
+    }
+
+    // Split the op straddling "end", keeping only the in-reference part.
+    // Only a query-consuming op leaves bases behind to soft clip; the cut
+    // part of a D or N holds no query bases.
+    int old_i = i, j = 1;
+    hts_pos_t clip = (bam_cigar_type(op) & 1) ? pos - end : 0;
+    cigar[old_i++] = bam_cigar_gen((uint32_t)(end - (pos-oplen)), op);
+
+    // Replace trailing elements: keep hard clips, count query bases.
+    for (i++; i < n_cigar; i++) {
+        op = bam_cigar_op(cigar[i]);
+        oplen = bam_cigar_oplen(cigar[i]);
+        if (op == BAM_CHARD_CLIP)
+            new_cigar[j++] = cigar[i];
+        else if (bam_cigar_type(op) & 1)
+            clip += oplen;
+    }
+
+    // Emit the soft clip only when it is non-empty.
+    uint32_t *tail = new_cigar + 1;
+    int n_tail = j - 1;
+    if (clip > 0) {
+        new_cigar[0] = bam_cigar_gen((uint32_t)clip, BAM_CSOFT_CLIP);
+        tail = new_cigar;
+        n_tail = j;
+    }
+
+    // We now have cigar[0..old_i-1] for existing CIGAR
+    // and tail[0..n_tail-1] for new CIGAR trailing component.
+    int n_new = old_i + n_tail;
+
+    if (n_new != n_cigar) {
+        uint8_t *seq_old = bam_get_seq(b);
+        uint8_t *aux_end = b->data + b->l_data;
+        // Negative moves data down, positive (at most one element) up.
+        int nshift = 4*(n_new - n_cigar);
+
+        // FIXME: make htslib's sam_realloc_bam_data public
+        if (nshift > 0 && (size_t)b->l_data + nshift > b->m_data) {
+            uint8_t *new_data = realloc(b->data, b->l_data + nshift);
+            if (!new_data) {
+                if (new_cigar != new_cigar_a)
+                    free(new_cigar);
+                return -1;
+            }
+            b->data = new_data;
+            b->m_data = b->l_data + nshift;
+            // The block may have moved; refresh every derived pointer.
+            cigar = bam_get_cigar(b);
+            seq_old = bam_get_seq(b);
+            aux_end = b->data + b->l_data;
+        }
+
+        memmove(seq_old+nshift, seq_old, aux_end - seq_old);
+        b->l_data += nshift;
+        b->core.n_cigar = n_new;
+    }
+    memcpy(&cigar[old_i], tail, 4*n_tail);
+
+    if (new_cigar != new_cigar_a)
+        free(new_cigar);
+
+    return 0;
+}
+
+// Parses a comma-separated list of keywords for the bam sanitizer into a
+// set of FIX_* flags.
+//
+// "all" (or "*"), "none", "off" and "on" replace whatever has been
+// accumulated so far; "pos", "mqual", "unmap", "cigar" and "aux" each add
+// one flag.  Keywords must match exactly.  Empty items (repeated or
+// trailing commas) are ignored.
+//
+// Returns the flag set on success (0 for a NULL or empty string),
+//        -1 after reporting an unrecognised keyword.
+int bam_sanitize_options(const char *str) {
+    const struct {
+        const char *name;
+        int replace;   // non-zero: assign flags; zero: OR them in
+        int flags;
+    } keywords[] = {
+        { "all",   1, FIX_ALL },
+        { "*",     1, FIX_ALL },
+        { "none",  1, 0 },
+        { "off",   1, 0 },
+        // default for position sorted data
+        { "on",    1, FIX_MQUAL | FIX_UNMAP | FIX_CIGAR | FIX_AUX },
+        { "pos",   0, FIX_POS },
+        { "mqual", 0, FIX_MQUAL },
+        { "unmap", 0, FIX_UNMAP },
+        { "cigar", 0, FIX_CIGAR },
+        { "aux",   0, FIX_AUX },
+    };
+    const int n_keywords = sizeof(keywords) / sizeof(keywords[0]);
+    int opt = 0;
+
+    while (str && *str) {
+        const char *str_start;
+        while (*str == ',')
+            str++;
+
+        for (str_start = str; *str && *str != ','; str++);
+        int len = str - str_start;
+        if (len == 0)
+            // Only separators were left; nothing more to parse.
+            break;
+
+        // The token is not NUL terminated, so compare len characters and
+        // then check that the keyword ends there too.
+        int k;
+        for (k = 0; k < n_keywords; k++) {
+            if (strncmp(str_start, keywords[k].name, len) == 0 &&
+                keywords[k].name[len] == '\0')
+                break;
+        }
+
+        if (k == n_keywords) {
+            print_error("sanitize", "Unrecognised keyword %.*s\n",
+                        len, str_start);
+            return -1;
+        }
+
+        if (keywords[k].replace)
+            opt = keywords[k].flags;
+        else
+            opt |= keywords[k].flags;
+    }
+
+    return opt;
+}
+
+// Sanitize one alignment record in place; "flags" is a set of FIX_* bits
+// selecting which corrections are applied.
+//
+// Mapped records may be flagged unmapped or have their CIGAR trimmed to
+// the reference length.  Records flagged unmapped may lose their CIGAR,
+// mapping quality and NM/MD/CG/SM aux tags.
+//
+// Returns 0 on success, -1 on failure.
+int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) {
+    if (b->core.tid < 0 && (flags & FIX_POS)) {
+        // RNAME * => pos 0.  NB can break alignment chr/pos sort order
+        b->core.pos = -1;
+        if (flags & FIX_UNMAP)
+            b->core.flag |= BAM_FUNMAP;
+    }
+
+    if (!(b->core.flag & BAM_FUNMAP) && (flags & FIX_CIGAR)) {
+        const int may_unmap = (flags & FIX_UNMAP) != 0;
+
+        if (may_unmap && b->core.pos < 0) {
+            // Mapped => unmapped correction
+            b->core.flag |= BAM_FUNMAP;
+        } else {
+            hts_pos_t ref_len = sam_hdr_tid2len(h, b->core.tid);
+
+            if (may_unmap && b->core.pos >= ref_len) {
+                // Starts at or beyond the end of the reference.
+                b->core.flag |= BAM_FUNMAP;
+                if (flags & FIX_POS) {
+                    b->core.pos = -1;
+                    b->core.tid = -1;
+                }
+            } else if (bam_endpos(b) > ref_len) {
+                // Runs off the end of the reference; trim the CIGAR.
+                if (bam_trim(b, ref_len) < 0)
+                    return -1;
+            }
+        }
+    }
+
+    // Everything below applies to unmapped records only.
+    if (!(b->core.flag & BAM_FUNMAP))
+        return 0;
+
+    // Unmapped -> cigar/qual corrections
+    if (b->core.n_cigar > 0 && (flags & FIX_CIGAR))
+        clear_cigar(b);
+
+    if (flags & FIX_MQUAL)
+        b->core.qual = 0;
+
+    if (!(flags & FIX_AUX))
+        return 0;
+
+    // Remove NM, MD, CG, SM tags by compacting the kept tags downwards.
+    uint8_t *tag = bam_aux_first(b);
+    uint8_t *data_end = b->data + b->l_data;
+    uint8_t *dest = tag ? tag-2 : data_end;
+
+#define XTAG(a) (((a)[0]<<8) + (a)[1])
+    while (tag) {
+        // Fetch the successor first, before any bytes are moved.
+        uint8_t *next = bam_aux_next(b, tag);
+        if (!next && errno != ENOENT)
+            return -1;
+
+        // "dest" always points to an aux tag start, while "tag" and
+        // "next" point just after the two byte key.
+        uint8_t *field = tag - 2;
+        uint8_t *field_end = next ? next-2 : data_end;
+        int key = (int)field[0]<<8 | field[1];
+        int drop = key == XTAG("NM") || key == XTAG("MD") ||
+                   key == XTAG("CG") || key == XTAG("SM");
+
+        if (!drop) {
+            ptrdiff_t len = field_end - field;
+            if (field != dest)
+                memmove(dest, field, len);
+            dest += len;
+        }
+        tag = next;
+    }
+    b->l_data = dest - b->data;
+
+    return 0;
+}
+
// currently, this function ONLY works if each read has one hit
-static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg)
+static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
+ int proper_pair_check, int add_ct,
+ int do_mate_scoring, char *arg_list, int no_pg,
+ int sanitize_flags)
{
sam_hdr_t *header;
bam1_t *b[2] = { NULL, NULL };
curr = 0; has_prev = 0;
while ((result = sam_read1(in, header, b[curr])) >= 0) {
bam1_t *cur = b[curr], *pre = b[1-curr];
+ if (bam_sanitize(header, cur, sanitize_flags) < 0)
+ goto fail;
if (cur->core.flag & BAM_FSECONDARY)
{
if ( !remove_reads ) {
if (sam_write1(out, header, cur) < 0) goto write_fail;
continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
}
- if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag
- {
- cur->core.flag |= BAM_FUNMAP;
- }
if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end
{
cur_end = bam_endpos(cur);
-
- // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag
- if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP;
}
+
if (has_prev) { // do we have a pair of reads to examine?
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
pre->core.flag |= BAM_FPAIRED;
}
has_prev = 0;
} else { // unpaired? clear bad info and write it out
- if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
- pre->core.flag |= BAM_FUNMAP;
- pre->core.tid = -1;
- pre->core.pos = -1;
- }
pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
" -c Add template cigar ct tag\n"
" -m Add mate score tag\n"
" -u Uncompressed output\n"
+" -z, --sanitize FLAG[,FLAG]\n"
+" Sanitize alignment fields [defaults to all types]\n"
" --no-PG do not add a PG line\n");
sam_global_opt_help(where, "-.O..@-.");
{
htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1,
+ mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[4] = {'w', 'b', 0, 0};
static const struct option lopts[] = {
// parse args
if (argc == 1) { usage(stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcmO:@:uz:", lopts, NULL)) >= 0) {
switch (c) {
- case 'r': remove_reads = 1; break;
- case 'p': proper_pair_check = 0; break;
- case 'c': add_ct = 1; break;
- case 'm': mate_score = 1; break;
- case 'u': wmode[2] = '0'; break;
- case 1: no_pg = 1; break;
- default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- /* else fall-through */
- case '?': usage(stderr); goto fail;
+ case 'r': remove_reads = 1; break;
+ case 'p': proper_pair_check = 0; break;
+ case 'c': add_ct = 1; break;
+ case 'm': mate_score = 1; break;
+ case 'u': wmode[2] = '0'; break;
+ case 1: no_pg = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage(stderr); goto fail;
+ case 'z':
+ if ((sanitize_flags = bam_sanitize_options(optarg)) < 0)
+ exit(1);
+ break;
}
}
if (optind+1 >= argc) { usage(stderr); goto fail; }
}
// run
- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg);
+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct,
+ mate_score, arg_list, no_pg, sanitize_flags);
// cleanup
sam_close(in);
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2017, 2019 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2017, 2019, 2022 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
return 0;
}
+// Remove the CIGAR array from a BAM record in place.
+// Everything stored after the CIGAR (from the sequence up to the end of the
+// record data) is slid down over it, then l_data and n_cigar are updated.
+static void clear_cigar(bam1_t *b) {
+    void *cigar_start = bam_get_cigar(b);
+    uint8_t *seq_start = bam_get_seq(b);
+    uint8_t *data_end = b->data + b->l_data;
+
+    memmove(cigar_start, seq_start, data_end - seq_start);
+
+    // Each CIGAR element occupies 4 bytes.
+    b->l_data -= 4*b->core.n_cigar;
+    b->core.n_cigar = 0;
+}
+
+// Trim a CIGAR field to end on reference position "end".  Query bases in
+// the removed part are merged into a single soft clip; trailing hard clips
+// are kept after it.  Removed ops that consume no query bases (D, N, P)
+// add nothing to the soft clip.
+//
+// If the op that crosses "end" also starts at or beyond "end", the record
+// is flagged unmapped (and no longer properly paired) and its CIGAR is left
+// untouched for the caller to clear.
+//
+// Returns 0 on success (including when no trimming was needed),
+//        -1 on memory allocation failure.
+static int bam_trim(bam1_t *b, hts_pos_t end) {
+    hts_pos_t pos = b->core.pos;
+    int n_cigar = b->core.n_cigar, i;
+    uint32_t new_cigar_a[1024];
+    uint32_t *new_cigar = new_cigar_a;
+    uint32_t *cigar = bam_get_cigar(b);
+
+    // Find the first reference-consuming op that runs past "end".
+    int op = 0, oplen = 0;
+    for (i = 0; i < n_cigar; i++) {
+        op = bam_cigar_op(cigar[i]);
+        oplen = bam_cigar_oplen(cigar[i]);
+        if (!(bam_cigar_type(op) & 2))
+            continue;
+        pos += oplen;
+        if (pos > end)
+            break;
+    }
+
+    if (i == n_cigar)
+        // looks fine already
+        return 0;
+
+    if (pos-oplen >= end) {
+        // The crossing op starts at or past "end"; this will trigger
+        // CIGAR *, MQUAL 0 in the caller.  Return before building a
+        // replacement tail: there is no soft clip element to fold the
+        // trailing ops into.
+        // NOTE(review): this also unmaps reads whose earlier ops end
+        // exactly on "end"; that matches the previous behaviour.
+        b->core.flag |= BAM_FUNMAP;
+        b->core.flag &= ~BAM_FPROPER_PAIR;
+        return 0;
+    }
+
+    // Tail layout: new_cigar[0] is reserved for the soft clip and hard
+    // clips follow from new_cigar[1], so at most n_cigar-i elements.
+    if (n_cigar-i >= 1024-1) {
+        new_cigar = malloc(4*(n_cigar-i+1));
+        if (!new_cigar)
+            return -1;
+    }
+
+    // Split the op straddling "end", keeping only the in-reference part.
+    // Only a query-consuming op leaves bases behind to soft clip; the cut
+    // part of a D or N holds no query bases.
+    int old_i = i, j = 1;
+    hts_pos_t clip = (bam_cigar_type(op) & 1) ? pos - end : 0;
+    cigar[old_i++] = bam_cigar_gen((uint32_t)(end - (pos-oplen)), op);
+
+    // Replace trailing elements: keep hard clips, count query bases.
+    for (i++; i < n_cigar; i++) {
+        op = bam_cigar_op(cigar[i]);
+        oplen = bam_cigar_oplen(cigar[i]);
+        if (op == BAM_CHARD_CLIP)
+            new_cigar[j++] = cigar[i];
+        else if (bam_cigar_type(op) & 1)
+            clip += oplen;
+    }
+
+    // Emit the soft clip only when it is non-empty.
+    uint32_t *tail = new_cigar + 1;
+    int n_tail = j - 1;
+    if (clip > 0) {
+        new_cigar[0] = bam_cigar_gen((uint32_t)clip, BAM_CSOFT_CLIP);
+        tail = new_cigar;
+        n_tail = j;
+    }
+
+    // We now have cigar[0..old_i-1] for existing CIGAR
+    // and tail[0..n_tail-1] for new CIGAR trailing component.
+    int n_new = old_i + n_tail;
+
+    if (n_new != n_cigar) {
+        uint8_t *seq_old = bam_get_seq(b);
+        uint8_t *aux_end = b->data + b->l_data;
+        // Negative moves data down, positive (at most one element) up.
+        int nshift = 4*(n_new - n_cigar);
+
+        // FIXME: make htslib's sam_realloc_bam_data public
+        if (nshift > 0 && (size_t)b->l_data + nshift > b->m_data) {
+            uint8_t *new_data = realloc(b->data, b->l_data + nshift);
+            if (!new_data) {
+                if (new_cigar != new_cigar_a)
+                    free(new_cigar);
+                return -1;
+            }
+            b->data = new_data;
+            b->m_data = b->l_data + nshift;
+            // The block may have moved; refresh every derived pointer.
+            cigar = bam_get_cigar(b);
+            seq_old = bam_get_seq(b);
+            aux_end = b->data + b->l_data;
+        }
+
+        memmove(seq_old+nshift, seq_old, aux_end - seq_old);
+        b->l_data += nshift;
+        b->core.n_cigar = n_new;
+    }
+    memcpy(&cigar[old_i], tail, 4*n_tail);
+
+    if (new_cigar != new_cigar_a)
+        free(new_cigar);
+
+    return 0;
+}
+
+// Parses a comma-separated list of keywords for the bam sanitizer into a
+// set of FIX_* flags.
+//
+// "all" (or "*"), "none", "off" and "on" replace whatever has been
+// accumulated so far; "pos", "mqual", "unmap", "cigar" and "aux" each add
+// one flag.  Keywords must match exactly.  Empty items (repeated or
+// trailing commas) are ignored.
+//
+// Returns the flag set on success (0 for a NULL or empty string),
+//        -1 after reporting an unrecognised keyword.
+int bam_sanitize_options(const char *str) {
+    const struct {
+        const char *name;
+        int replace;   // non-zero: assign flags; zero: OR them in
+        int flags;
+    } keywords[] = {
+        { "all",   1, FIX_ALL },
+        { "*",     1, FIX_ALL },
+        { "none",  1, 0 },
+        { "off",   1, 0 },
+        // default for position sorted data
+        { "on",    1, FIX_MQUAL | FIX_UNMAP | FIX_CIGAR | FIX_AUX },
+        { "pos",   0, FIX_POS },
+        { "mqual", 0, FIX_MQUAL },
+        { "unmap", 0, FIX_UNMAP },
+        { "cigar", 0, FIX_CIGAR },
+        { "aux",   0, FIX_AUX },
+    };
+    const int n_keywords = sizeof(keywords) / sizeof(keywords[0]);
+    int opt = 0;
+
+    while (str && *str) {
+        const char *str_start;
+        while (*str == ',')
+            str++;
+
+        for (str_start = str; *str && *str != ','; str++);
+        int len = str - str_start;
+        if (len == 0)
+            // Only separators were left; nothing more to parse.
+            break;
+
+        // The token is not NUL terminated, so compare len characters and
+        // then check that the keyword ends there too.
+        int k;
+        for (k = 0; k < n_keywords; k++) {
+            if (strncmp(str_start, keywords[k].name, len) == 0 &&
+                keywords[k].name[len] == '\0')
+                break;
+        }
+
+        if (k == n_keywords) {
+            print_error("sanitize", "Unrecognised keyword %.*s\n",
+                        len, str_start);
+            return -1;
+        }
+
+        if (keywords[k].replace)
+            opt = keywords[k].flags;
+        else
+            opt |= keywords[k].flags;
+    }
+
+    return opt;
+}
+
+// Sanitize one alignment record in place; "flags" is a set of FIX_* bits
+// selecting which corrections are applied.
+//
+// Mapped records may be flagged unmapped or have their CIGAR trimmed to
+// the reference length.  Records flagged unmapped may lose their CIGAR,
+// mapping quality and NM/MD/CG/SM aux tags.
+//
+// Returns 0 on success, -1 on failure.
+int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags) {
+    if (b->core.tid < 0 && (flags & FIX_POS)) {
+        // RNAME * => pos 0.  NB can break alignment chr/pos sort order
+        b->core.pos = -1;
+        if (flags & FIX_UNMAP)
+            b->core.flag |= BAM_FUNMAP;
+    }
+
+    if (!(b->core.flag & BAM_FUNMAP) && (flags & FIX_CIGAR)) {
+        const int may_unmap = (flags & FIX_UNMAP) != 0;
+
+        if (may_unmap && b->core.pos < 0) {
+            // Mapped => unmapped correction
+            b->core.flag |= BAM_FUNMAP;
+        } else {
+            hts_pos_t ref_len = sam_hdr_tid2len(h, b->core.tid);
+
+            if (may_unmap && b->core.pos >= ref_len) {
+                // Starts at or beyond the end of the reference.
+                b->core.flag |= BAM_FUNMAP;
+                if (flags & FIX_POS) {
+                    b->core.pos = -1;
+                    b->core.tid = -1;
+                }
+            } else if (bam_endpos(b) > ref_len) {
+                // Runs off the end of the reference; trim the CIGAR.
+                if (bam_trim(b, ref_len) < 0)
+                    return -1;
+            }
+        }
+    }
+
+    // Everything below applies to unmapped records only.
+    if (!(b->core.flag & BAM_FUNMAP))
+        return 0;
+
+    // Unmapped -> cigar/qual corrections
+    if (b->core.n_cigar > 0 && (flags & FIX_CIGAR))
+        clear_cigar(b);
+
+    if (flags & FIX_MQUAL)
+        b->core.qual = 0;
+
+    if (!(flags & FIX_AUX))
+        return 0;
+
+    // Remove NM, MD, CG, SM tags by compacting the kept tags downwards.
+    uint8_t *tag = bam_aux_first(b);
+    uint8_t *data_end = b->data + b->l_data;
+    uint8_t *dest = tag ? tag-2 : data_end;
+
+#define XTAG(a) (((a)[0]<<8) + (a)[1])
+    while (tag) {
+        // Fetch the successor first, before any bytes are moved.
+        uint8_t *next = bam_aux_next(b, tag);
+        if (!next && errno != ENOENT)
+            return -1;
+
+        // "dest" always points to an aux tag start, while "tag" and
+        // "next" point just after the two byte key.
+        uint8_t *field = tag - 2;
+        uint8_t *field_end = next ? next-2 : data_end;
+        int key = (int)field[0]<<8 | field[1];
+        int drop = key == XTAG("NM") || key == XTAG("MD") ||
+                   key == XTAG("CG") || key == XTAG("SM");
+
+        if (!drop) {
+            ptrdiff_t len = field_end - field;
+            if (field != dest)
+                memmove(dest, field, len);
+            dest += len;
+        }
+        tag = next;
+    }
+    b->l_data = dest - b->data;
+
+    return 0;
+}
+
// currently, this function ONLY works if each read has one hit
-static int bam_mating_core(samFile *in, samFile *out, int remove_reads, int proper_pair_check, int add_ct, int do_mate_scoring, char *arg_list, int no_pg)
+static int bam_mating_core(samFile *in, samFile *out, int remove_reads,
+ int proper_pair_check, int add_ct,
+ int do_mate_scoring, char *arg_list, int no_pg,
+ int sanitize_flags)
{
sam_hdr_t *header;
bam1_t *b[2] = { NULL, NULL };
curr = 0; has_prev = 0;
while ((result = sam_read1(in, header, b[curr])) >= 0) {
bam1_t *cur = b[curr], *pre = b[1-curr];
+ if (bam_sanitize(header, cur, sanitize_flags) < 0)
+ goto fail;
if (cur->core.flag & BAM_FSECONDARY)
{
if ( !remove_reads ) {
if (sam_write1(out, header, cur) < 0) goto write_fail;
continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
}
- if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag
- {
- cur->core.flag |= BAM_FUNMAP;
- }
if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end
{
cur_end = bam_endpos(cur);
-
- // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag
- if (cur_end > sam_hdr_tid2len(header, cur->core.tid)) cur->core.flag |= BAM_FUNMAP;
}
+
if (has_prev) { // do we have a pair of reads to examine?
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
pre->core.flag |= BAM_FPAIRED;
}
has_prev = 0;
} else { // unpaired? clear bad info and write it out
- if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
- pre->core.flag |= BAM_FUNMAP;
- pre->core.tid = -1;
- pre->core.pos = -1;
- }
pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) {
" -c Add template cigar ct tag\n"
" -m Add mate score tag\n"
" -u Uncompressed output\n"
+" -z, --sanitize FLAG[,FLAG]\n"
+" Sanitize alignment fields [defaults to all types]\n"
" --no-PG do not add a PG line\n");
sam_global_opt_help(where, "-.O..@-.");
{
htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
- int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1, mate_score = 0, no_pg = 0;
+ int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1,
+ mate_score = 0, no_pg = 0, sanitize_flags = FIX_ALL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[4] = {'w', 'b', 0, 0};
static const struct option lopts[] = {
// parse args
if (argc == 1) { usage(samtools_stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcmO:@:u", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcmO:@:uz:", lopts, NULL)) >= 0) {
switch (c) {
- case 'r': remove_reads = 1; break;
- case 'p': proper_pair_check = 0; break;
- case 'c': add_ct = 1; break;
- case 'm': mate_score = 1; break;
- case 'u': wmode[2] = '0'; break;
- case 1: no_pg = 1; break;
- default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- /* else fall-through */
- case '?': usage(samtools_stderr); goto fail;
+ case 'r': remove_reads = 1; break;
+ case 'p': proper_pair_check = 0; break;
+ case 'c': add_ct = 1; break;
+ case 'm': mate_score = 1; break;
+ case 'u': wmode[2] = '0'; break;
+ case 1: no_pg = 1; break;
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?': usage(samtools_stderr); goto fail;
+ case 'z':
+ if ((sanitize_flags = bam_sanitize_options(optarg)) < 0)
+ samtools_exit(1);
+ break;
}
}
if (optind+1 >= argc) { usage(samtools_stderr); goto fail; }
}
// run
- res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct, mate_score, arg_list, no_pg);
+ res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct,
+ mate_score, arg_list, no_pg, sanitize_flags);
// cleanup
sam_close(in);
/* bam_md.c -- calmd subcommand.
- Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
+ Copyright (C) 2009-2011, 2014-2015, 2019-2020, 2022 Genome Research Ltd.
Portions copyright (C) 2009-2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <ctype.h>
#include <limits.h>
#include <errno.h>
+#include <assert.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
#define UPDATE_MD 16
#define HASH_QNM 32
+typedef struct cached_ref_entry {
+ char *ref; // sequence obtained from fai_fetch64(); NULL if this slot has not been filled
+ hts_pos_t len; // length stored alongside ref
+} cached_ref_entry;
+
+typedef struct ref_cache {
+ cached_ref_entry *refs; // per-tid table of nref entries; NULL until caching mode is turned on
+ char *last_ref; // streaming mode: the one sequence currently held; NULL once caching
+ hts_pos_t last_len; // length of last_ref
+ int nref; // number of entries in refs (from sam_hdr_nref())
+ int last_tid; // tid of the last reference handed out by get_ref(); bam_fillmd() starts it at -2
+} ref_cache;
+
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
}
+// Get the reference sequence for tid: the sequence goes to *ref_out, its
+// length to *len_out and its name to *ref_name_out. Returns 0 on success (also
+// when the reference can't be fetched, leaving *ref_out NULL), -1 on error.
+// Sequences stay owned by cache; refs_destroy() frees them. While tids only
+// increase, the previous sequence is freed when a new one is loaded. A fetch
+// for a tid below the last one used turns on caching of every sequence loaded.
+static int get_ref(faidx_t *fai, sam_hdr_t *header, ref_cache *cache,
+ int tid, char **ref_out, const char **ref_name_out,
+ hts_pos_t *len_out)
+{
+ char *ref = NULL;
+ const char *ref_name;
+ hts_pos_t len = 0;
+
+ // This should only be called when tid changes
+ assert(tid != cache->last_tid);
+
+ // Array lookup, should be fast
+ ref_name = sam_hdr_tid2name(header, tid);
+ *ref_name_out = ref_name;
+
+ // Return a cached entry, if available
+ if (cache->refs && tid >= 0 && tid < cache->nref
+ && cache->refs[tid].ref) {
+ assert(cache->last_ref == NULL);
+ *ref_out = cache->refs[tid].ref;
+ *len_out = cache->refs[tid].len;
+ cache->last_tid = tid;
+ return 0;
+ }
+
+ // Try to get the reference
+ if (ref_name)
+ ref = fai_fetch64(fai, ref_name, &len);
+
+ if (!ref) {
+ // Historically, calmd doesn't worry too much about missing refs
+ *ref_out = NULL;
+ *len_out = 0;
+ return 0; // NB: cache->last_tid is left unchanged on this path
+ }
+
+ if (!cache->refs && cache->last_tid > tid) {
+ // Going backwards through the list of tids implies
+ // a non-position-ordered file, so turn on caching mode
+ cache->nref = sam_hdr_nref(header);
+ if (cache->nref < 0) {
+ print_error("calmd", "couldn't get number of refs from header");
+ return -1;
+ }
+ if (cache->nref > 0) {
+ cache->refs = calloc(cache->nref, sizeof(cache->refs[0]));
+ if (!cache->refs) {
+ print_error_errno("calmd",
+ "couldn't allocate reference cache");
+ return -1;
+ }
+ // Add the reference we already have as the first entry
+ if (cache->last_tid >= 0 && cache->last_tid < cache->nref) {
+ cache->refs[cache->last_tid].ref = cache->last_ref;
+ cache->refs[cache->last_tid].len = cache->last_len;
+ } else {
+ free(cache->last_ref); // last_tid has no slot in the table, so drop the old sequence
+ }
+ cache->last_ref = NULL;
+ }
+ }
+
+ if (cache->refs) {
+ assert(cache->last_ref == NULL); // Shouldn't be set when caching
+ // Add the new reference to the cache
+ if (tid >= 0 && tid < cache->nref) {
+ cache->refs[tid].ref = ref;
+ cache->refs[tid].len = len;
+ }
+ } else {
+ // Streaming mode - free the last ref and replace it with this one
+ free(cache->last_ref);
+ cache->last_ref = ref;
+ cache->last_len = len;
+ }
+
+ *ref_out = ref;
+ *len_out = len;
+ cache->last_tid = tid;
+ return 0;
+}
+
+static void refs_destroy(ref_cache *cache) { // Free every sequence held by cache; the ref_cache struct itself is not freed
+ if (cache->refs) { // caching mode: free each stored sequence, then the table
+ int i;
+ assert(cache->last_ref == NULL);
+ for (i = 0; i < cache->nref; i++)
+ free(cache->refs[i].ref);
+ free(cache->refs);
+ } else { // streaming mode: only last_ref can be held
+ free(cache->last_ref);
+ }
+}
+
int calmd_usage() {
fprintf(stderr,
"Usage: samtools calmd [-eubrAESQ] <aln.bam> <ref.fasta>\n"
int bam_fillmd(int argc, char *argv[])
{
- int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0;
- hts_pos_t len;
+ int c, flt_flag, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0;
+ hts_pos_t len = 0;
htsThreadPool p = {NULL, 0};
samFile *fp = NULL, *fpout = NULL;
sam_hdr_t *header = NULL;
faidx_t *fai = NULL;
char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
+ ref_cache refs = { NULL, NULL, 0, 0, -2 };
const char *ref_name = NULL;
bam1_t *b = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
}
while ((ret = sam_read1(fp, header, b)) >= 0) {
if (b->core.tid >= 0) {
- if (tid != b->core.tid) {
- free(ref);
- ref = NULL;
- len = 0;
- ref_name = sam_hdr_tid2name(header, b->core.tid);
- if (ref_name) {
- ref = fai_fetch64(fai, ref_name, &len);
+ if (refs.last_tid != b->core.tid) {
+ if (get_ref(fai, header, &refs, b->core.tid,
+ &ref, &ref_name, &len) < 0) {
+ goto fail;
}
- tid = b->core.tid;
if (ref == 0) { // FIXME: Should this always be fatal?
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
ref_name ? ref_name : "(unknown)");
sam_hdr_destroy(header);
free(arg_list);
- free(ref);
+ refs_destroy(&refs);
fai_destroy(fai);
sam_close(fp);
if (sam_close(fpout) < 0) {
fail:
free(arg_list);
- free(ref);
+ refs_destroy(&refs);
if (b) bam_destroy1(b);
if (header) sam_hdr_destroy(header);
if (fai) fai_destroy(fai);
/* bam_md.c -- calmd subcommand.
- Copyright (C) 2009-2011, 2014-2015, 2019-2020 Genome Research Ltd.
+ Copyright (C) 2009-2011, 2014-2015, 2019-2020, 2022 Genome Research Ltd.
Portions copyright (C) 2009-2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <ctype.h>
#include <limits.h>
#include <errno.h>
+#include <assert.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
#define UPDATE_MD 16
#define HASH_QNM 32
+typedef struct cached_ref_entry {
+ char *ref; // sequence obtained from fai_fetch64(); NULL if this slot has not been filled
+ hts_pos_t len; // length stored alongside ref
+} cached_ref_entry;
+
+typedef struct ref_cache {
+ cached_ref_entry *refs; // per-tid table of nref entries; NULL until caching mode is turned on
+ char *last_ref; // streaming mode: the one sequence currently held; NULL once caching
+ hts_pos_t last_len; // length of last_ref
+ int nref; // number of entries in refs (from sam_hdr_nref())
+ int last_tid; // tid of the last reference handed out by get_ref(); bam_fillmd() starts it at -2
+} ref_cache;
+
int bam_aux_drop_other(bam1_t *b, uint8_t *s);
static int bam_fillmd1_core(const char *ref_name, bam1_t *b, char *ref,
return bam_fillmd1_core(NULL, b, ref, INT_MAX, flag, 0, quiet_mode, NULL);
}
+// Get the reference sequence for tid: the sequence goes to *ref_out, its
+// length to *len_out and its name to *ref_name_out. Returns 0 on success (also
+// when the reference can't be fetched, leaving *ref_out NULL), -1 on error.
+// Sequences stay owned by cache; refs_destroy() frees them. While tids only
+// increase, the previous sequence is freed when a new one is loaded. A fetch
+// for a tid below the last one used turns on caching of every sequence loaded.
+static int get_ref(faidx_t *fai, sam_hdr_t *header, ref_cache *cache,
+ int tid, char **ref_out, const char **ref_name_out,
+ hts_pos_t *len_out)
+{
+ char *ref = NULL;
+ const char *ref_name;
+ hts_pos_t len = 0;
+
+ // This should only be called when tid changes
+ assert(tid != cache->last_tid);
+
+ // Array lookup, should be fast
+ ref_name = sam_hdr_tid2name(header, tid);
+ *ref_name_out = ref_name;
+
+ // Return a cached entry, if available
+ if (cache->refs && tid >= 0 && tid < cache->nref
+ && cache->refs[tid].ref) {
+ assert(cache->last_ref == NULL);
+ *ref_out = cache->refs[tid].ref;
+ *len_out = cache->refs[tid].len;
+ cache->last_tid = tid;
+ return 0;
+ }
+
+ // Try to get the reference
+ if (ref_name)
+ ref = fai_fetch64(fai, ref_name, &len);
+
+ if (!ref) {
+ // Historically, calmd doesn't worry too much about missing refs
+ *ref_out = NULL;
+ *len_out = 0;
+ return 0; // NB: cache->last_tid is left unchanged on this path
+ }
+
+ if (!cache->refs && cache->last_tid > tid) {
+ // Going backwards through the list of tids implies
+ // a non-position-ordered file, so turn on caching mode
+ cache->nref = sam_hdr_nref(header);
+ if (cache->nref < 0) {
+ print_error("calmd", "couldn't get number of refs from header");
+ return -1;
+ }
+ if (cache->nref > 0) {
+ cache->refs = calloc(cache->nref, sizeof(cache->refs[0]));
+ if (!cache->refs) {
+ print_error_errno("calmd",
+ "couldn't allocate reference cache");
+ return -1;
+ }
+ // Add the reference we already have as the first entry
+ if (cache->last_tid >= 0 && cache->last_tid < cache->nref) {
+ cache->refs[cache->last_tid].ref = cache->last_ref;
+ cache->refs[cache->last_tid].len = cache->last_len;
+ } else {
+ free(cache->last_ref); // last_tid has no slot in the table, so drop the old sequence
+ }
+ cache->last_ref = NULL;
+ }
+ }
+
+ if (cache->refs) {
+ assert(cache->last_ref == NULL); // Shouldn't be set when caching
+ // Add the new reference to the cache
+ if (tid >= 0 && tid < cache->nref) {
+ cache->refs[tid].ref = ref;
+ cache->refs[tid].len = len;
+ }
+ } else {
+ // Streaming mode - free the last ref and replace it with this one
+ free(cache->last_ref);
+ cache->last_ref = ref;
+ cache->last_len = len;
+ }
+
+ *ref_out = ref;
+ *len_out = len;
+ cache->last_tid = tid;
+ return 0;
+}
+
+static void refs_destroy(ref_cache *cache) { // Free every sequence held by cache; the ref_cache struct itself is not freed
+ if (cache->refs) { // caching mode: free each stored sequence, then the table
+ int i;
+ assert(cache->last_ref == NULL);
+ for (i = 0; i < cache->nref; i++)
+ free(cache->refs[i].ref);
+ free(cache->refs);
+ } else { // streaming mode: only last_ref can be held
+ free(cache->last_ref);
+ }
+}
+
int calmd_usage() {
fprintf(samtools_stderr,
"Usage: samtools calmd [-eubrAESQ] <aln.bam> <ref.fasta>\n"
int bam_fillmd(int argc, char *argv[])
{
- int c, flt_flag, tid = -2, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0;
- hts_pos_t len;
+ int c, flt_flag, ret, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag, quiet_mode, no_pg = 0;
+ hts_pos_t len = 0;
htsThreadPool p = {NULL, 0};
samFile *fp = NULL, *fpout = NULL;
sam_hdr_t *header = NULL;
faidx_t *fai = NULL;
char *ref = NULL, mode_w[8], *ref_file, *arg_list = NULL;
+ ref_cache refs = { NULL, NULL, 0, 0, -2 };
const char *ref_name = NULL;
bam1_t *b = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
}
while ((ret = sam_read1(fp, header, b)) >= 0) {
if (b->core.tid >= 0) {
- if (tid != b->core.tid) {
- free(ref);
- ref = NULL;
- len = 0;
- ref_name = sam_hdr_tid2name(header, b->core.tid);
- if (ref_name) {
- ref = fai_fetch64(fai, ref_name, &len);
+ if (refs.last_tid != b->core.tid) {
+ if (get_ref(fai, header, &refs, b->core.tid,
+ &ref, &ref_name, &len) < 0) {
+ goto fail;
}
- tid = b->core.tid;
if (ref == 0) { // FIXME: Should this always be fatal?
fprintf(samtools_stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
ref_name ? ref_name : "(unknown)");
sam_hdr_destroy(header);
free(arg_list);
- free(ref);
+ refs_destroy(&refs);
fai_destroy(fai);
sam_close(fp);
if (sam_close(fpout) < 0) {
fail:
free(arg_list);
- free(ref);
+ refs_destroy(&refs);
if (b) bam_destroy1(b);
if (header) sam_hdr_destroy(header);
if (fai) fai_destroy(fai);
fprintf(fp,
" -r, --region REG region in which pileup is generated\n"
" -R, --ignore-RG ignore RG tags (one BAM = one sample)\n"
-" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require);
+" --rf, --incl-flags STR|INT\n"
+" required flags: only include reads with any of\n"
+" the mask bits set [%s]\n", tmp_require);
fprintf(fp,
-" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n"
+" --ff, --excl-flags STR|INT\n"
+" filter flags: skip reads with any of the mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
" -x, --ignore-overlaps-removal, --disable-overlap-removal\n"
fprintf(fp,
" -r, --region REG region in which pileup is generated\n"
" -R, --ignore-RG ignore RG tags (one BAM = one sample)\n"
-" --rf, --incl-flags STR|INT required flags: include reads with any of the mask bits set [%s]\n", tmp_require);
+" --rf, --incl-flags STR|INT\n"
+" required flags: only include reads with any of\n"
+" the mask bits set [%s]\n", tmp_require);
fprintf(fp,
-" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n"
+" --ff, --excl-flags STR|INT\n"
+" filter flags: skip reads with any of the mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
" -x, --ignore-overlaps-removal, --disable-overlap-removal\n"
#include "htslib/sam.h"
#include "htslib/hts_endian.h"
#include "htslib/cram.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "samtools.h"
#include "bedidx.h"
#include "bam.h"
+#define BAM_BLOCK_SIZE 2*1024*1024
+#define MAX_TMP_FILES 64
// Struct which contains the sorting key for TemplateCoordinate sort.
typedef struct {
static SamOrder g_sam_order = Coordinate;
static char g_sort_tag[2] = {0,0};
+#define is_digit(c) ((c)<='9' && (c)>='0')
static int strnum_cmp(const char *_a, const char *_b)
{
const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
const unsigned char *pa = a, *pb = b;
while (*pa && *pb) {
- if (isdigit(*pa) && isdigit(*pb)) {
+ if (!is_digit(*pa) || !is_digit(*pb)) {
+ if (*pa != *pb)
+ return (int)*pa - (int)*pb;
+ ++pa; ++pb;
+ } else {
+ // skip leading zeros
while (*pa == '0') ++pa;
while (*pb == '0') ++pb;
- while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb;
- if (isdigit(*pa) && isdigit(*pb)) {
- int i = 0;
- while (isdigit(pa[i]) && isdigit(pb[i])) ++i;
- return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb;
- } else if (isdigit(*pa)) return 1;
- else if (isdigit(*pb)) return -1;
- else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1;
- } else {
- if (*pa != *pb) return (int)*pa - (int)*pb;
- ++pa; ++pb;
+
+ // skip matching digits
+ while (is_digit(*pa) && *pa == *pb)
+ pa++, pb++;
+
+ // Now mismatching, so see which ends the number sooner
+ int diff = (int)*pa - (int)*pb;
+ while (is_digit(*pa) && is_digit(*pb))
+ pa++, pb++;
+
+ if (is_digit(*pa))
+ return 1; // pa still going, so larger
+ else if (is_digit(*pb))
+ return -1; // pb still going, so larger
+ else if (diff)
+ return diff; // same length, so earlier diff
}
}
return *pa? 1 : *pb? -1 : 0;
print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
+ hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
print_error(cmd, "failed to read header from \"%s\"", fn[i]);
print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
+ hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
if (!no_pg && sam_hdr_add_pg(hout, "samtools",
"VN", samtools_version(),
arg_list ? "CL": NULL,
int n, char * const *fn, int num_in_mem,
buf_region *in_mem, bam1_tag *buf,
template_coordinate_keys_t *keys,
- khash_t(const_c2c) *lib_lookup, int n_threads,
+ khash_t(const_c2c) *lib_lookup,
+ htsThreadPool *htspool,
const char *cmd, const htsFormat *in_fmt,
const htsFormat *out_fmt, char *arg_list, int no_pg,
int write_index) {
print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
+ hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
+ if (htspool->pool)
+ hts_set_opt(fp[i], HTS_OPT_THREAD_POOL, htspool);
// Read header ...
hin = sam_hdr_read(fp[i]);
print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
+ hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
if (!no_pg && sam_hdr_add_pg(hout, "samtools",
"VN", samtools_version(),
return -1;
}
- if (n_threads > 1) hts_set_threads(fpout, n_threads);
+ if (htspool->pool)
+ hts_set_opt(fpout, HTS_OPT_THREAD_POOL, htspool);
if (sam_hdr_write(fpout, hout) != 0) {
print_error_errno(cmd, "failed to write header to \"%s\"", out);
typedef struct {
size_t buf_len;
- const char *prefix;
bam1_tag *buf;
const sam_hdr_t *h;
- char *tmpfile_name;
- int index;
int error;
- int no_save;
int large_pos;
int minimiser_kmer;
} worker_t;
fp = sam_open_format(fn, mode, fmt);
if (fp == NULL) return -1;
+ hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(),
arg_list ? "CL": NULL,
arg_list ? arg_list : NULL,
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
- char *name;
- size_t name_len;
w->error = 0;
- w->tmpfile_name = NULL;
switch (g_sam_order) {
case Coordinate:
ks_mergesort(sort, w->buf_len, w->buf, 0);
}
- if (w->no_save)
- return 0;
-
- name_len = strlen(w->prefix) + 30;
- name = (char*)calloc(name_len, 1);
- if (!name) { w->error = errno; return 0; }
- const int MAX_TRIES = 1000;
- int tries = 0;
- for (;;) {
- if (tries) {
- snprintf(name, name_len, "%s.%.4d-%.3d.bam",
- w->prefix, w->index, tries);
- } else {
- snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index);
- }
-
- if (write_buffer(name, w->large_pos ? "wzx1" : "wbx1",
- w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) {
- break;
- }
- if (errno == EEXIST && tries < MAX_TRIES) {
- tries++;
- } else {
- w->error = errno;
- break;
- }
- }
-
- if (w->error) {
- free(name);
- } else {
- w->tmpfile_name = name;
- }
return 0;
}
-static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
- const sam_hdr_t *h, int n_threads, buf_region *in_mem,
- int large_pos, int minimiser_kmer, char **fns, size_t fns_size)
+static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
+ int n_threads, buf_region *in_mem,
+ int large_pos, int minimiser_kmer)
{
int i;
size_t pos, rest;
for (i = 0; i < n_threads; ++i) {
w[i].buf_len = rest / (n_threads - i);
w[i].buf = &buf[pos];
- w[i].prefix = prefix;
w[i].h = h;
- w[i].index = n_files + i;
- w[i].tmpfile_name = NULL;
w[i].large_pos = large_pos;
w[i].minimiser_kmer = minimiser_kmer;
- if (in_mem) {
- w[i].no_save = 1;
- in_mem[i].from = pos;
- in_mem[i].to = pos + w[i].buf_len;
- } else {
- w[i].no_save = 0;
- }
+ in_mem[i].from = pos;
+ in_mem[i].to = pos + w[i].buf_len;
pos += w[i].buf_len; rest -= w[i].buf_len;
pthread_create(&tid[i], &attr, worker, &w[i]);
}
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
- if (!in_mem) {
- assert(w[i].index >= 0 && w[i].index < fns_size);
- fns[w[i].index] = w[i].tmpfile_name;
- }
if (w[i].error != 0) {
errno = w[i].error;
- print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
+ print_error_errno("sort", "failed to sort block %d", i);
n_failed++;
}
}
- if (n_failed && !in_mem) {
- // Clean up any temporary files that did get made, as we're
- // about to lose track of them
- for (i = 0; i < n_threads; ++i) {
- if (fns[w[i].index]) {
- unlink(fns[w[i].index]);
- free(fns[w[i].index]);
- fns[w[i].index] = NULL;
- }
- }
- }
- free(tid); free(w);
- if (n_failed) return -1;
- if (in_mem) return n_threads;
- return n_files + n_threads;
+ free(w);
+ free(tid);
+
+ return n_failed ? -1 : n_threads;
}
static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) {
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
- int ret = -1, res, i, nref, n_files = 0;
+ int ret = -1, res, i, nref, n_files = 0, n_big_files = 0, fn_counter = 0;
size_t max_k, k, max_mem, bam_mem_offset;
sam_hdr_t *header = NULL;
samFile *fp = NULL;
const char *new_ss = NULL;
buf_region *in_mem = NULL;
khash_t(const_c2c) *lib_lookup = NULL;
+ htsThreadPool htspool = { NULL, 0 };
int num_in_mem = 0;
int large_pos = 0;
print_error_errno("sort", "can't open \"%s\"", fn);
goto err;
}
+ hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
header = sam_hdr_read(fp);
if (header == NULL) {
print_error("sort", "failed to read header from \"%s\"", fn);
}
}
- // No gain to using the thread pool here as the flow of this code
- // is such that we are *either* reading *or* sorting. Hence a shared
- // pool makes no real difference except to reduce the thread count a little.
- if (n_threads > 1)
- hts_set_threads(fp, n_threads);
+ if (n_threads > 1) {
+ htspool.pool = hts_tpool_init(n_threads);
+ if (!htspool.pool) {
+ print_error_errno("sort", "failed to set up thread pool");
+ goto err;
+ }
+ hts_set_opt(fp, HTS_OPT_THREAD_POOL, &htspool);
+ }
if ((bam_mem = malloc(max_mem)) == NULL) {
print_error("sort", "couldn't allocate memory for bam_mem");
goto err;
}
+ in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
+ if (!in_mem) goto err;
+
// write sub files
k = max_k = bam_mem_offset = 0;
+ size_t name_len = strlen(prefix) + 30;
while ((res = sam_read1(fp, header, b)) >= 0) {
int mem_full = 0;
++k;
if (mem_full) {
- if (hts_resize(char *, n_files + (n_threads > 0 ? n_threads : 1),
- &fns_size, &fns, 0) < 0)
+ if (hts_resize(char *, n_files + 1, &fns_size, &fns, 0) < 0)
goto err;
- int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- NULL, large_pos, minimiser_kmer, fns, fns_size);
- if (new_n < 0) {
+
+ int sort_res = sort_blocks(k, buf, header, n_threads,
+ in_mem, large_pos, minimiser_kmer);
+ if (sort_res < 0)
+ goto err;
+
+ fns[n_files] = calloc(name_len, 1);
+ if (!fns[n_files])
+ goto err;
+ const int MAX_TRIES = 1000;
+ int tries = 0, merge_res = -1;
+ char *sort_by_tag = (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) ? sort_tag : NULL;
+ int consolidate_from = n_files;
+ if (n_files - n_big_files >= MAX_TMP_FILES/2)
+ consolidate_from = n_big_files;
+ else if (n_files >= MAX_TMP_FILES)
+ consolidate_from = 0;
+
+ for (;;) {
+ if (tries) {
+ snprintf(fns[n_files], name_len, "%s.%.4d-%.3d.bam",
+ prefix, fn_counter, tries);
+ } else {
+ snprintf(fns[n_files], name_len, "%s.%.4d.bam", prefix,
+ fn_counter);
+ }
+ if (bam_merge_simple(g_sam_order, sort_by_tag, fns[n_files],
+ large_pos ? "wzx1" : "wbx1", header,
+ n_files - consolidate_from,
+ &fns[consolidate_from], n_threads,
+ in_mem, buf, keys,
+ lib_lookup, &htspool, "sort", NULL, NULL,
+ NULL, 1, 0) >= 0) {
+ merge_res = 0;
+ break;
+ }
+ if (errno == EEXIST && tries < MAX_TRIES) {
+ tries++;
+ } else {
+ break;
+ }
+ }
+ fn_counter++;
+ if (merge_res < 0) {
+ if (errno != EEXIST)
+ unlink(fns[n_files]);
+ free(fns[n_files]);
goto err;
- } else {
- n_files = new_n;
}
+
+ if (consolidate_from < n_files) {
+ for (i = consolidate_from; i < n_files; i++) {
+ unlink(fns[i]);
+ free(fns[i]);
+ }
+ fns[consolidate_from] = fns[n_files];
+ n_files = consolidate_from;
+ n_big_files = consolidate_from + 1;
+ }
+
+ n_files++;
k = 0;
if (keys != NULL) keys->n = 0;
bam_mem_offset = 0;
+
}
}
if (res != -1) {
// Sort last records
if (k > 0) {
- in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
- if (!in_mem) goto err;
- num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- in_mem, large_pos, minimiser_kmer, fns, fns_size);
+ num_in_mem = sort_blocks(k, buf, header, n_threads,
+ in_mem, large_pos, minimiser_kmer);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? sort_tag : NULL;
if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
n_files, fns, num_in_mem, in_mem, buf, keys,
- lib_lookup, n_threads, "sort", in_fmt, out_fmt,
+ lib_lookup, &htspool, "sort", in_fmt, out_fmt,
arg_list, no_pg, write_index) < 0) {
// Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
lib_lookup_destroy(lib_lookup);
sam_hdr_destroy(header);
if (fp) sam_close(fp);
+ if (htspool.pool)
+ hts_tpool_destroy(htspool.pool);
+
return ret;
}
#include "htslib/sam.h"
#include "htslib/hts_endian.h"
#include "htslib/cram.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "samtools.h"
#include "bedidx.h"
#include "bam.h"
+#define BAM_BLOCK_SIZE 2*1024*1024
+#define MAX_TMP_FILES 64
// Struct which contains the sorting key for TemplateCoordinate sort.
typedef struct {
static SamOrder g_sam_order = Coordinate;
static char g_sort_tag[2] = {0,0};
+#define is_digit(c) ((c)<='9' && (c)>='0')
static int strnum_cmp(const char *_a, const char *_b)
{
const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b;
const unsigned char *pa = a, *pb = b;
while (*pa && *pb) {
- if (isdigit(*pa) && isdigit(*pb)) {
+ if (!is_digit(*pa) || !is_digit(*pb)) {
+ if (*pa != *pb)
+ return (int)*pa - (int)*pb;
+ ++pa; ++pb;
+ } else {
+ // skip leading zeros
while (*pa == '0') ++pa;
while (*pb == '0') ++pb;
- while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb;
- if (isdigit(*pa) && isdigit(*pb)) {
- int i = 0;
- while (isdigit(pa[i]) && isdigit(pb[i])) ++i;
- return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb;
- } else if (isdigit(*pa)) return 1;
- else if (isdigit(*pb)) return -1;
- else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1;
- } else {
- if (*pa != *pb) return (int)*pa - (int)*pb;
- ++pa; ++pb;
+
+ // skip matching digits
+ while (is_digit(*pa) && *pa == *pb)
+ pa++, pb++;
+
+ // Now mismatching, so see which ends the number sooner
+ int diff = (int)*pa - (int)*pb;
+ while (is_digit(*pa) && is_digit(*pb))
+ pa++, pb++;
+
+ if (is_digit(*pa))
+ return 1; // pa still going, so larger
+ else if (is_digit(*pb))
+ return -1; // pb still going, so larger
+ else if (diff)
+ return diff; // same length, so earlier diff
}
}
return *pa? 1 : *pb? -1 : 0;
print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
+ hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
print_error(cmd, "failed to read header from \"%s\"", fn[i]);
print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
+ hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
if (!no_pg && sam_hdr_add_pg(hout, "samtools",
"VN", samtools_version(),
arg_list ? "CL": NULL,
int n, char * const *fn, int num_in_mem,
buf_region *in_mem, bam1_tag *buf,
template_coordinate_keys_t *keys,
- khash_t(const_c2c) *lib_lookup, int n_threads,
+ khash_t(const_c2c) *lib_lookup,
+ htsThreadPool *htspool,
const char *cmd, const htsFormat *in_fmt,
const htsFormat *out_fmt, char *arg_list, int no_pg,
int write_index) {
print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
+ hts_set_opt(fp[i], HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
+ if (htspool->pool)
+ hts_set_opt(fp[i], HTS_OPT_THREAD_POOL, htspool);
// Read header ...
hin = sam_hdr_read(fp[i]);
print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
+ hts_set_opt(fpout, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
if (!no_pg && sam_hdr_add_pg(hout, "samtools",
"VN", samtools_version(),
return -1;
}
- if (n_threads > 1) hts_set_threads(fpout, n_threads);
+ if (htspool->pool)
+ hts_set_opt(fpout, HTS_OPT_THREAD_POOL, htspool);
if (sam_hdr_write(fpout, hout) != 0) {
print_error_errno(cmd, "failed to write header to \"%s\"", out);
typedef struct {
size_t buf_len;
- const char *prefix;
bam1_tag *buf;
const sam_hdr_t *h;
- char *tmpfile_name;
- int index;
int error;
- int no_save;
int large_pos;
int minimiser_kmer;
} worker_t;
fp = sam_open_format(fn, mode, fmt);
if (fp == NULL) return -1;
+ hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
if (!no_pg && sam_hdr_add_pg((sam_hdr_t *)h, "samtools", "VN", samtools_version(),
arg_list ? "CL": NULL,
arg_list ? arg_list : NULL,
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
- char *name;
- size_t name_len;
w->error = 0;
- w->tmpfile_name = NULL;
switch (g_sam_order) {
case Coordinate:
ks_mergesort(sort, w->buf_len, w->buf, 0);
}
- if (w->no_save)
- return 0;
-
- name_len = strlen(w->prefix) + 30;
- name = (char*)calloc(name_len, 1);
- if (!name) { w->error = errno; return 0; }
- const int MAX_TRIES = 1000;
- int tries = 0;
- for (;;) {
- if (tries) {
- snprintf(name, name_len, "%s.%.4d-%.3d.bam",
- w->prefix, w->index, tries);
- } else {
- snprintf(name, name_len, "%s.%.4d.bam", w->prefix, w->index);
- }
-
- if (write_buffer(name, w->large_pos ? "wzx1" : "wbx1",
- w->buf_len, w->buf, w->h, 0, NULL, 0, NULL, 1, 0) == 0) {
- break;
- }
- if (errno == EEXIST && tries < MAX_TRIES) {
- tries++;
- } else {
- w->error = errno;
- break;
- }
- }
-
- if (w->error) {
- free(name);
- } else {
- w->tmpfile_name = name;
- }
return 0;
}
-static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
- const sam_hdr_t *h, int n_threads, buf_region *in_mem,
- int large_pos, int minimiser_kmer, char **fns, size_t fns_size)
+static int sort_blocks(size_t k, bam1_tag *buf, const sam_hdr_t *h,
+ int n_threads, buf_region *in_mem,
+ int large_pos, int minimiser_kmer)
{
int i;
size_t pos, rest;
for (i = 0; i < n_threads; ++i) {
w[i].buf_len = rest / (n_threads - i);
w[i].buf = &buf[pos];
- w[i].prefix = prefix;
w[i].h = h;
- w[i].index = n_files + i;
- w[i].tmpfile_name = NULL;
w[i].large_pos = large_pos;
w[i].minimiser_kmer = minimiser_kmer;
- if (in_mem) {
- w[i].no_save = 1;
- in_mem[i].from = pos;
- in_mem[i].to = pos + w[i].buf_len;
- } else {
- w[i].no_save = 0;
- }
+ in_mem[i].from = pos;
+ in_mem[i].to = pos + w[i].buf_len;
pos += w[i].buf_len; rest -= w[i].buf_len;
pthread_create(&tid[i], &attr, worker, &w[i]);
}
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
- if (!in_mem) {
- assert(w[i].index >= 0 && w[i].index < fns_size);
- fns[w[i].index] = w[i].tmpfile_name;
- }
if (w[i].error != 0) {
errno = w[i].error;
- print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
+ print_error_errno("sort", "failed to sort block %d", i);
n_failed++;
}
}
- if (n_failed && !in_mem) {
- // Clean up any temporary files that did get made, as we're
- // about to lose track of them
- for (i = 0; i < n_threads; ++i) {
- if (fns[w[i].index]) {
- unlink(fns[w[i].index]);
- free(fns[w[i].index]);
- fns[w[i].index] = NULL;
- }
- }
- }
- free(tid); free(w);
- if (n_failed) return -1;
- if (in_mem) return n_threads;
- return n_files + n_threads;
+ free(w);
+ free(tid);
+
+ return n_failed ? -1 : n_threads;
}
static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) {
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
- int ret = -1, res, i, nref, n_files = 0;
+ int ret = -1, res, i, nref, n_files = 0, n_big_files = 0, fn_counter = 0;
size_t max_k, k, max_mem, bam_mem_offset;
sam_hdr_t *header = NULL;
samFile *fp = NULL;
const char *new_ss = NULL;
buf_region *in_mem = NULL;
khash_t(const_c2c) *lib_lookup = NULL;
+ htsThreadPool htspool = { NULL, 0 };
int num_in_mem = 0;
int large_pos = 0;
print_error_errno("sort", "can't open \"%s\"", fn);
goto err;
}
+ hts_set_opt(fp, HTS_OPT_BLOCK_SIZE, BAM_BLOCK_SIZE);
header = sam_hdr_read(fp);
if (header == NULL) {
print_error("sort", "failed to read header from \"%s\"", fn);
}
}
- // No gain to using the thread pool here as the flow of this code
- // is such that we are *either* reading *or* sorting. Hence a shared
- // pool makes no real difference except to reduce the thread count a little.
- if (n_threads > 1)
- hts_set_threads(fp, n_threads);
+ if (n_threads > 1) {
+ htspool.pool = hts_tpool_init(n_threads);
+ if (!htspool.pool) {
+ print_error_errno("sort", "failed to set up thread pool");
+ goto err;
+ }
+ hts_set_opt(fp, HTS_OPT_THREAD_POOL, &htspool);
+ }
if ((bam_mem = malloc(max_mem)) == NULL) {
print_error("sort", "couldn't allocate memory for bam_mem");
goto err;
}
+ in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
+ if (!in_mem) goto err;
+
// write sub files
k = max_k = bam_mem_offset = 0;
+ size_t name_len = strlen(prefix) + 30;
while ((res = sam_read1(fp, header, b)) >= 0) {
int mem_full = 0;
++k;
if (mem_full) {
- if (hts_resize(char *, n_files + (n_threads > 0 ? n_threads : 1),
- &fns_size, &fns, 0) < 0)
+ if (hts_resize(char *, n_files + 1, &fns_size, &fns, 0) < 0)
goto err;
- int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- NULL, large_pos, minimiser_kmer, fns, fns_size);
- if (new_n < 0) {
+
+ int sort_res = sort_blocks(k, buf, header, n_threads,
+ in_mem, large_pos, minimiser_kmer);
+ if (sort_res < 0)
+ goto err;
+
+ fns[n_files] = calloc(name_len, 1);
+ if (!fns[n_files])
+ goto err;
+ const int MAX_TRIES = 1000;
+ int tries = 0, merge_res = -1;
+ char *sort_by_tag = (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) ? sort_tag : NULL;
+ int consolidate_from = n_files;
+ if (n_files - n_big_files >= MAX_TMP_FILES/2)
+ consolidate_from = n_big_files;
+ else if (n_files >= MAX_TMP_FILES)
+ consolidate_from = 0;
+
+ for (;;) {
+ if (tries) {
+ snprintf(fns[n_files], name_len, "%s.%.4d-%.3d.bam",
+ prefix, fn_counter, tries);
+ } else {
+ snprintf(fns[n_files], name_len, "%s.%.4d.bam", prefix,
+ fn_counter);
+ }
+ if (bam_merge_simple(g_sam_order, sort_by_tag, fns[n_files],
+ large_pos ? "wzx1" : "wbx1", header,
+ n_files - consolidate_from,
+ &fns[consolidate_from], n_threads,
+ in_mem, buf, keys,
+ lib_lookup, &htspool, "sort", NULL, NULL,
+ NULL, 1, 0) >= 0) {
+ merge_res = 0;
+ break;
+ }
+ if (errno == EEXIST && tries < MAX_TRIES) {
+ tries++;
+ } else {
+ break;
+ }
+ }
+ fn_counter++;
+ if (merge_res < 0) {
+ if (errno != EEXIST)
+ unlink(fns[n_files]);
+ free(fns[n_files]);
goto err;
- } else {
- n_files = new_n;
}
+
+ if (consolidate_from < n_files) {
+ for (i = consolidate_from; i < n_files; i++) {
+ unlink(fns[i]);
+ free(fns[i]);
+ }
+ fns[consolidate_from] = fns[n_files];
+ n_files = consolidate_from;
+ n_big_files = consolidate_from + 1;
+ }
+
+ n_files++;
k = 0;
if (keys != NULL) keys->n = 0;
bam_mem_offset = 0;
+
}
}
if (res != -1) {
// Sort last records
if (k > 0) {
- in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
- if (!in_mem) goto err;
- num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- in_mem, large_pos, minimiser_kmer, fns, fns_size);
+ num_in_mem = sort_blocks(k, buf, header, n_threads,
+ in_mem, large_pos, minimiser_kmer);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? sort_tag : NULL;
if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
n_files, fns, num_in_mem, in_mem, buf, keys,
- lib_lookup, n_threads, "sort", in_fmt, out_fmt,
+ lib_lookup, &htspool, "sort", in_fmt, out_fmt,
arg_list, no_pg, write_index) < 0) {
// Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
lib_lookup_destroy(lib_lookup);
sam_hdr_destroy(header);
if (fp) sam_close(fp);
+ if (htspool.pool)
+ hts_tpool_destroy(htspool.pool);
+
return ret;
}
/* bamshuf.c -- collate subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013, 2015-2019 Genome Research Ltd.
+ Copyright (C) 2013, 2015-2019,2023 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
// Read input, distribute reads pseudo-randomly into n_files temporary
// files.
- fp = sam_open_format(fn, "r", &ga->in);
+ fp = sam_open_format(fn ? fn : "-", "r", &ga->in);
if (fp == NULL) {
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
static int usage(FILE *fp, int n_files, int reads_store) {
fprintf(fp,
- "Usage: samtools collate [-Ou] [-o <name>] [-n nFiles] [-l cLevel] <in.bam> [<prefix>]\n\n"
+ "Usage: samtools collate [options...] <in.bam> [<prefix>]\n\n"
"Options:\n"
- " -O output to stdout\n"
- " -o output file name (use prefix if not set)\n"
- " -u uncompressed BAM output\n"
- " -f fast (only primary alignments)\n"
- " -r working reads stored (with -f) [%d]\n" // reads_store
- " -l INT compression level [%d]\n" // DEF_CLEVEL
- " -n INT number of temporary files [%d]\n" // n_files
+ " -O Output to stdout\n"
+ " -o Output file name (use prefix if not set)\n"
+ " -u Uncompressed BAM output\n"
+ " -f Fast (only primary alignments)\n"
+ " -r Working reads stored (with -f) [%d]\n" // reads_store
+ " -l INT Compression level [%d]\n" // DEF_CLEVEL
+ " -n INT Number of temporary files [%d]\n" // n_files
+ " -T PREFIX\n"
+ " Write tempory files to PREFIX.nnnn.bam\n"
" --no-PG do not add a PG line\n",
reads_store, DEF_CLEVEL, n_files);
return 1;
}
-char * generate_prefix() {
+char *generate_prefix(const char *out_fn) {
char *prefix;
unsigned int pid = getpid();
+
+ if (out_fn && !(*out_fn == '-' && out_fn[1] == '\0')) {
+ // <out_fn>.<collate><pid>.<nnnn>.<bam>
+ size_t plen = strlen(out_fn) + 50;
+ if (!(prefix = malloc(plen))) {
+ perror("collate");
+ return NULL;
+ }
+ snprintf(prefix, plen, "%s.collate%x", out_fn, pid);
+ return prefix;
+ }
+
#ifdef _WIN32
# define PREFIX_LEN (MAX_PATH + 16)
DWORD ret;
snprintf(prefix + ret, PREFIX_LEN - ret, "\\%x", pid);
return prefix;
#else
-# define PREFIX_LEN 64
- prefix = malloc(PREFIX_LEN);
+ char *tmp_env = getenv("TMPDIR");
+ if (!tmp_env)
+ tmp_env = "/tmp";
+
+ size_t prefix_len = strlen(tmp_env)+20;
+ prefix = malloc(prefix_len);
if (!prefix) {
perror("collate");
return NULL;
}
- snprintf(prefix, PREFIX_LEN, "/tmp/collate%x", pid);
+ snprintf(prefix, prefix_len, "%s/collate%x", tmp_env, pid);
+
return prefix;
#endif
}
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:T:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'o': output_file = optarg; break;
case 'f': fast_coll = 1; break;
case 'r': reads_store = atoi(optarg); break;
+ case 'T': prefix = optarg; break;
case 1: no_pg = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
}
if (is_un) clevel = 0;
if (argc >= optind + 2) prefix = argv[optind+1];
+ if (argc == optind) {
+ if (argc > 1 || !isatty(STDIN_FILENO))
+ fprintf(stderr, "collate: no input filename specified.\n");
+ return usage(argc > 1 || !isatty(STDIN_FILENO) ? stderr : stdout,
+ n_files, reads_store);
+ }
if (!(prefix || is_stdout || output_file))
return usage(stderr, n_files, reads_store);
if (is_stdout && output_file) {
return usage(stderr, n_files, reads_store);
}
if (!prefix) {
- prefix = generate_prefix();
+ prefix = generate_prefix(output_file);
pre_mem = 1;
}
/* bamshuf.c -- collate subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013, 2015-2019 Genome Research Ltd.
+ Copyright (C) 2013, 2015-2019,2023 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
// Read input, distribute reads pseudo-randomly into n_files temporary
// files.
- fp = sam_open_format(fn, "r", &ga->in);
+ fp = sam_open_format(fn ? fn : "-", "r", &ga->in);
if (fp == NULL) {
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
static int usage(FILE *fp, int n_files, int reads_store) {
fprintf(fp,
- "Usage: samtools collate [-Ou] [-o <name>] [-n nFiles] [-l cLevel] <in.bam> [<prefix>]\n\n"
+ "Usage: samtools collate [options...] <in.bam> [<prefix>]\n\n"
"Options:\n"
- " -O output to samtools_stdout\n"
- " -o output file name (use prefix if not set)\n"
- " -u uncompressed BAM output\n"
- " -f fast (only primary alignments)\n"
- " -r working reads stored (with -f) [%d]\n" // reads_store
- " -l INT compression level [%d]\n" // DEF_CLEVEL
- " -n INT number of temporary files [%d]\n" // n_files
+ " -O Output to samtools_stdout\n"
+ " -o Output file name (use prefix if not set)\n"
+ " -u Uncompressed BAM output\n"
+ " -f Fast (only primary alignments)\n"
+ " -r Working reads stored (with -f) [%d]\n" // reads_store
+ " -l INT Compression level [%d]\n" // DEF_CLEVEL
+ " -n INT Number of temporary files [%d]\n" // n_files
+ " -T PREFIX\n"
+ " Write tempory files to PREFIX.nnnn.bam\n"
" --no-PG do not add a PG line\n",
reads_store, DEF_CLEVEL, n_files);
return 1;
}
-char * generate_prefix() {
+char *generate_prefix(const char *out_fn) {
char *prefix;
unsigned int pid = getpid();
+
+ if (out_fn && !(*out_fn == '-' && out_fn[1] == '\0')) {
+ // <out_fn>.<collate><pid>.<nnnn>.<bam>
+ size_t plen = strlen(out_fn) + 50;
+ if (!(prefix = malloc(plen))) {
+ perror("collate");
+ return NULL;
+ }
+ snprintf(prefix, plen, "%s.collate%x", out_fn, pid);
+ return prefix;
+ }
+
#ifdef _WIN32
# define PREFIX_LEN (MAX_PATH + 16)
DWORD ret;
snprintf(prefix + ret, PREFIX_LEN - ret, "\\%x", pid);
return prefix;
#else
-# define PREFIX_LEN 64
- prefix = malloc(PREFIX_LEN);
+ char *tmp_env = getenv("TMPDIR");
+ if (!tmp_env)
+ tmp_env = "/tmp";
+
+ size_t prefix_len = strlen(tmp_env)+20;
+ prefix = malloc(prefix_len);
if (!prefix) {
perror("collate");
return NULL;
}
- snprintf(prefix, PREFIX_LEN, "/tmp/collate%x", pid);
+ snprintf(prefix, prefix_len, "%s/collate%x", tmp_env, pid);
+
return prefix;
#endif
}
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "n:l:uOo:@:fr:T:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
case 'o': output_file = optarg; break;
case 'f': fast_coll = 1; break;
case 'r': reads_store = atoi(optarg); break;
+ case 'T': prefix = optarg; break;
case 1: no_pg = 1; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
}
if (is_un) clevel = 0;
if (argc >= optind + 2) prefix = argv[optind+1];
+ if (argc == optind) {
+ if (argc > 1 || !isatty(STDIN_FILENO))
+ fprintf(samtools_stderr, "collate: no input filename specified.\n");
+ return usage(argc > 1 || !isatty(STDIN_FILENO) ? samtools_stderr : samtools_stdout,
+ n_files, reads_store);
+ }
if (!(prefix || is_stdout || output_file))
return usage(samtools_stderr, n_files, reads_store);
if (is_stdout && output_file) {
return usage(samtools_stderr, n_files, reads_store);
}
if (!prefix) {
- prefix = generate_prefix();
+ prefix = generate_prefix(output_file);
pre_mem = 1;
}
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2022 Genome Research Ltd.
+ Copyright (C) 2008-2023 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
int main_samples(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
int main_reference(int argc, char *argv[]);
+int main_reset(int argc, char *argv[]);
+int main_cram_size(int argc, char *argv[]);
const char *samtools_version()
{
static void long_version(void) {
printf("samtools %s\n"
"Using htslib %s\n"
- "Copyright (C) 2022 Genome Research Ltd.\n",
+ "Copyright (C) 2023 Genome Research Ltd.\n",
samtools_version(), hts_version());
printf("\nSamtools compilation details:\n");
" fasta converts a BAM to a FASTA\n"
" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
" reference Generates a reference from aligned data\n"
+" reset Reverts aligner changes in reads\n"
"\n"
" -- Statistics\n"
" bedcov read depth per BED region\n"
" depth compute the depth\n"
" flagstat simple stats\n"
" idxstats BAM index stats\n"
+" cram-size list CRAM Content-ID and Data-Series sizes\n"
" phase phase heterozygotes\n"
" stats generate stats (former bamcheck)\n"
" ampliconstats generate amplicon specific stats\n"
else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1);
else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1);
else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1);
+ else if (strcmp(argv[1], "cram-size") == 0) ret = main_cram_size(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0 || \
strcmp(argv[1], "--version") == 0)
long_version();
else if (strcmp(argv[1], "--version-only") == 0) {
printf("%s+htslib-%s\n", samtools_version(), hts_version());
}
+ else if (strcmp(argv[1], "reset") == 0) ret = main_reset(argc-1, argv+1);
else {
fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2022 Genome Research Ltd.
+ Copyright (C) 2008-2023 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
int main_samples(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
int main_reference(int argc, char *argv[]);
+int main_reset(int argc, char *argv[]);
+int main_cram_size(int argc, char *argv[]);
const char *samtools_version()
{
static void long_version(void) {
fprintf(samtools_stdout, "samtools %s\n"
"Using htslib %s\n"
- "Copyright (C) 2022 Genome Research Ltd.\n",
+ "Copyright (C) 2023 Genome Research Ltd.\n",
samtools_version(), hts_version());
fprintf(samtools_stdout, "\nSamtools compilation details:\n");
" fasta converts a BAM to a FASTA\n"
" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
" reference Generates a reference from aligned data\n"
+" reset Reverts aligner changes in reads\n"
"\n"
" -- Statistics\n"
" bedcov read depth per BED region\n"
" depth compute the depth\n"
" flagstat simple stats\n"
" idxstats BAM index stats\n"
+" cram-size list CRAM Content-ID and Data-Series sizes\n"
" phase phase heterozygotes\n"
" stats generate stats (former bamcheck)\n"
" ampliconstats generate amplicon specific stats\n"
else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1);
else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1);
else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1);
+ else if (strcmp(argv[1], "cram-size") == 0) ret = main_cram_size(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0 || \
strcmp(argv[1], "--version") == 0)
long_version();
else if (strcmp(argv[1], "--version-only") == 0) {
fprintf(samtools_stdout, "%s+htslib-%s\n", samtools_version(), hts_version());
}
+ else if (strcmp(argv[1], "reset") == 0) ret = main_reset(argc-1, argv+1);
else {
fprintf(samtools_stderr, "[main] unrecognized command '%s'\n", argv[1]);
return 1;
p->base4 = 16;
p->padding = 1;
if (p->seq_offset < b->core.l_qseq)
- p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]);
else
p->qual = 0;
} else {
p->base = '*';
p->base4 = 16;
if (p->seq_offset+1 < b->core.l_qseq)
- p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]);
else
- p->qual = (p->qual + p->b_qual[p->seq_offset])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset]);
break;
case BAM_CPAD:
p->base = '*';
p->base4 = 16;
if (p->seq_offset+1 < b->core.l_qseq)
- p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]);
else
- p->qual = (p->qual + p->b_qual[p->seq_offset])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset]);
break;
case BAM_CREF_SKIP:
samFile *fp,
sam_hdr_t *h,
pileup_t *p),
- int (*seq_add)(void *client_data,
- samFile *fp,
- sam_hdr_t *h,
- pileup_t *p,
- int depth,
- hts_pos_t pos,
- int nth,
- int is_insert),
+ int (*seq_column)(void *client_data,
+ samFile *fp,
+ sam_hdr_t *h,
+ pileup_t *p,
+ int depth,
+ hts_pos_t pos,
+ int nth,
+ int is_insert),
+ void (*seq_free)(void *client_data,
+ samFile *fp,
+ sam_hdr_t *h,
+ pileup_t *p),
void *client_data) {
int ret = -1;
pileup_t *phead = NULL, *p, *pfree = NULL, *last, *next, *ptail = NULL;
ptail = phead;
/* Call our function on phead linked list */
- v = seq_add(client_data, fp, h, phead, depth,
+ v = seq_column(client_data, fp, h, phead, depth,
#ifdef START_WITH_DEL
- col-1,
+ col-1,
#else
- col,
+ col,
#endif
- nth, is_insert);
+ nth, is_insert);
/* Remove dead seqs */
for (p = eof_head ; p; p = p->eofn) {
p->next = pfree;
pfree = p;
+
+ if (seq_free)
+ seq_free(client_data, fp, h, p);
}
if (v == 1)
/* Tidy up */
for (p = pfree; p; p = next) {
next = p->next;
+ if (seq_free)
+ seq_free(client_data, fp, h, p);
free(p->b.data);
free(p);
}
p->base4 = 16;
p->padding = 1;
if (p->seq_offset < b->core.l_qseq)
- p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]);
else
p->qual = 0;
} else {
p->base = '*';
p->base4 = 16;
if (p->seq_offset+1 < b->core.l_qseq)
- p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]);
else
- p->qual = (p->qual + p->b_qual[p->seq_offset])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset]);
break;
case BAM_CPAD:
p->base = '*';
p->base4 = 16;
if (p->seq_offset+1 < b->core.l_qseq)
- p->qual = (p->qual + p->b_qual[p->seq_offset+1])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset+1]);
else
- p->qual = (p->qual + p->b_qual[p->seq_offset])/2;
+ p->qual = MIN(p->qual, p->b_qual[p->seq_offset]);
break;
case BAM_CREF_SKIP:
samFile *fp,
sam_hdr_t *h,
pileup_t *p),
- int (*seq_add)(void *client_data,
- samFile *fp,
- sam_hdr_t *h,
- pileup_t *p,
- int depth,
- hts_pos_t pos,
- int nth,
- int is_insert),
+ int (*seq_column)(void *client_data,
+ samFile *fp,
+ sam_hdr_t *h,
+ pileup_t *p,
+ int depth,
+ hts_pos_t pos,
+ int nth,
+ int is_insert),
+ void (*seq_free)(void *client_data,
+ samFile *fp,
+ sam_hdr_t *h,
+ pileup_t *p),
void *client_data) {
int ret = -1;
pileup_t *phead = NULL, *p, *pfree = NULL, *last, *next, *ptail = NULL;
ptail = phead;
/* Call our function on phead linked list */
- v = seq_add(client_data, fp, h, phead, depth,
+ v = seq_column(client_data, fp, h, phead, depth,
#ifdef START_WITH_DEL
- col-1,
+ col-1,
#else
- col,
+ col,
#endif
- nth, is_insert);
+ nth, is_insert);
/* Remove dead seqs */
for (p = eof_head ; p; p = p->eofn) {
p->next = pfree;
pfree = p;
+
+ if (seq_free)
+ seq_free(client_data, fp, h, p);
}
if (v == 1)
/* Tidy up */
for (p = pfree; p; p = next) {
next = p->next;
+ if (seq_free)
+ seq_free(client_data, fp, h, p);
free(p->b.data);
free(p);
}
bam1_t b; // Bam entry associated with struct
} pileup_t;
+/*
+ * The pileup loop executes and calls callbacks to perform the work.
+ *
+ * seq_fetch returns the next sequence. A return value of 0 from this
+ * indicates no more data.
+ *
+ * seq_init is called, if non-NULL, when a sequence is added to the pileup,
+ * seq_free likewise, if non-NULL, is called when a sequence is removed
+ * from the pileup.
+ * These two functions are akin to the constructor and destructors added
+ * to mpileup.
+ *
+ * seq_column is the primary work horse which is executed for each
+ * reference position, and for each inserted base per ref pos.
+ *
+ * If we were to invert this from a loop generating callbacks to a polled
+ * style interface like mpileup, then the seq_column bit would be dropped
+ * and replaced by the returned pileup and associated parameters.
+ */
int pileup_loop(samFile *fp,
sam_hdr_t *h,
int (*seq_fetch)(void *client_data,
samFile *fp,
sam_hdr_t *h,
pileup_t *p),
- int (*seq_add)(void *client_data,
- samFile *fp,
- sam_hdr_t *h,
- pileup_t *p,
- int depth,
- hts_pos_t pos,
- int nth,
- int is_insert),
+ int (*seq_column)(void *client_data,
+ samFile *fp,
+ sam_hdr_t *h,
+ pileup_t *p,
+ int depth,
+ hts_pos_t pos,
+ int nth,
+ int is_insert),
+ void (*seq_free)(void *client_data,
+ samFile *fp,
+ sam_hdr_t *h,
+ pileup_t *p),
void *client_data);
--- /dev/null
+/* cram_size.c -- produces summary of the size of each cram data-series
+
+ Copyright (C) 2023 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+// TODO: add range query. Eg the ability to look at size for "*" only
+// (unmapped), or in a specific region such as a centromere.
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <strings.h>
+
+#include "htslib/bgzf.h"
+#include "htslib/sam.h"
+#include "htslib/cram.h"
+#include "htslib/kstring.h"
+#include "htslib/khash.h"
+#include "samtools.h"
+#include "sam_opts.h"
+#include "htslib/hfile.h"
+
+/*----------------------------------------------------------------------
+ * Compression method handling
+ */
+
+// A numeric version of the cram_method_details struct.
+// We expand the myriad of struct field combinations into a single
+// enumerated type so we can index and accumulate statistics for
+// purposes of reporting.
+//
+// These expanded numeric values have no definition within CRAM itself
+// and never occur within the file format.
+enum comp_expanded {
+ //----
+ // Copies from htslib cram_block_method enum
+ COMP_RAW = CRAM_COMP_RAW,
+ COMP_GZIP = CRAM_COMP_GZIP,
+ COMP_BZIP2 = CRAM_COMP_BZIP2,
+ COMP_LZMA = CRAM_COMP_LZMA,
+ COMP_RANS8 = CRAM_COMP_RANS4x8,
+ COMP_RANS16 = CRAM_COMP_RANSNx16,
+ COMP_ARITH = CRAM_COMP_ARITH,
+ COMP_FQZ = CRAM_COMP_FQZ,
+ COMP_TOK3 = CRAM_COMP_TOK3,
+
+ //----
+ // Localised variants.
+
+ // Gzip
+ COMP_GZIP_1,
+ COMP_GZIP_9,
+
+ // Bzip2
+ COMP_BZIP2_1,
+ COMP_BZIP2_2,
+ COMP_BZIP2_3,
+ COMP_BZIP2_4,
+ COMP_BZIP2_5,
+ COMP_BZIP2_6,
+ COMP_BZIP2_7,
+ COMP_BZIP2_8,
+ COMP_BZIP2_9,
+
+ // rans 4x8
+ COMP_RANS4x8_O0,
+ COMP_RANS4x8_O1,
+
+ // rans Nx16. Note order here is to enable selection via bit-fields
+ // bit 0: O0/O1
+ // bit 1: RLE
+ // bit 2: PACK
+ // bit 3: 32x16
+ COMP_RANS4x16_O0,
+ COMP_RANS4x16_O1,
+ COMP_RANS4x16_O0R, // +RLE
+ COMP_RANS4x16_O1R,
+ COMP_RANS4x16_O0P, // +PACK
+ COMP_RANS4x16_O1P,
+ COMP_RANS4x16_O0PR, // +PACK+RLE
+ COMP_RANS4x16_O1PR,
+ COMP_RANS32x16_O0, // SIMD variants
+ COMP_RANS32x16_O1,
+ COMP_RANS32x16_O0R, // +RLE
+ COMP_RANS32x16_O1R,
+ COMP_RANS32x16_O0P, // +PACK
+ COMP_RANS32x16_O1P,
+ COMP_RANS32x16_O0PR, // +PACK+RLE
+ COMP_RANS32x16_O1PR,
+ COMP_RANSNx16_STRIPE,
+ COMP_RANSNx16_CAT,
+
+ // Arith
+ COMP_ARITH_O0,
+ COMP_ARITH_O1,
+ COMP_ARITH_O0R, // +RLE
+ COMP_ARITH_O1R,
+ COMP_ARITH_O0P, // +PACK
+ COMP_ARITH_O1P,
+ COMP_ARITH_O0PR, // +PACK+RLE
+ COMP_ARITH_O1PR,
+ COMP_ARITH_STRIPE,
+ COMP_ARITH_CAT, // no entropy encoder
+ COMP_ARITH_EXT, // external entropy encode
+
+    // Name tokeniser
+ COMP_TOK3_RANS,
+ COMP_TOK3_ARITH,
+
+ // To mark maximum size
+ COMP_MAX,
+};
+
+static enum comp_expanded comp_method2expanded(cram_method_details *cm) {
+ switch (cm->method) {
+ case CRAM_COMP_GZIP:
+ switch (cm->level) {
+ case 1: return COMP_GZIP_1;
+ case 9: return COMP_GZIP_9;
+ default: return COMP_GZIP;
+ }
+ break;
+
+ case CRAM_COMP_BZIP2:
+ if (cm->level >= 1 && cm->level <= 9)
+ return COMP_BZIP2_1 + cm->level-1;
+ else
+ return COMP_BZIP2;
+ break;
+
+ case CRAM_COMP_RANS4x8:
+ return cm->order ? COMP_RANS4x8_O1 : COMP_RANS4x8_O0;
+
+ case CRAM_COMP_RANSNx16: {
+ // 8 4x16, 8 32x16 and 2 stripe/cat
+ if (cm->stripe) return COMP_RANSNx16_STRIPE;
+ if (cm->cat) return COMP_RANSNx16_CAT;
+ int c = COMP_RANS4x16_O0;
+ c += 1*cm->order;
+ c += 2*cm->rle;
+ c += 4*cm->pack;
+ c += 8*(cm->Nway==32);
+ return c;
+ }
+
+ case CRAM_COMP_ARITH: {
+        // 8 order/RLE/PACK combinations, plus stripe, cat and ext
+ if (cm->stripe) return COMP_ARITH_STRIPE;
+ if (cm->cat) return COMP_ARITH_CAT;
+ if (cm->ext) return COMP_ARITH_EXT;
+ int c = COMP_ARITH_O0;
+ c += 1*cm->order;
+ c += 2*cm->rle;
+ c += 4*cm->pack;
+ return c;
+ }
+
+ case CRAM_COMP_TOK3:
+ return cm->level < 10
+ ? COMP_TOK3_RANS
+ : COMP_TOK3_ARITH;
+
+ default:
+ // Any unspecialised method
+ return (enum comp_expanded)cm->method;
+ }
+}
+
+// Short form of cram_block_method_int type
+static char comp_method2char[COMP_MAX] =
+ ".gblr0afn" // standard CRAM methods
+ "_G" // gzip
+ "bbbbbbbbB" // bzip2
+ "rR" // rans4x8
+ "010101014545454582" // ransNx16
+ "aAaAaAaAaaa" // arith
+ "nN"; // tok3
+
+// Long form of cram_block_method_int type
+static char *comp_method2str[COMP_MAX] = {
+ // Standard CRAM methods
+ "raw", "gzip", "bzip2", "lzma", "r4x8", "rNx16",
+ "arith", "fqzcomp", "tok3",
+
+ // custom gzip
+ "gzip-min", "gzip-max",
+
+ // custom bzip2
+ "bzip2-1", "bzip2-2", "bzip2-3", "bzip2-4", "bzip2-5",
+ "bzip2-6", "bzip2-7", "bzip2-8", "bzip2-9",
+
+ // rANS 4x8
+ "r4x8-o0", "r4x8-o1",
+
+    // rANS Nx16: 4x16, 32x16, then stripe and cat
+ "r4x16-o0", "r4x16-o1",
+
+ "r4x16-o0R", "r4x16-o1R",
+ "r4x16-o0P", "r4x16-o1P",
+ "r4x16-o0PR", "r4x16-o1PR",
+ "r32x16-o0", "r32x16-o1",
+ "r32x16-o0R", "r32x16-o1R",
+ "r32x16-o0P", "r32x16-o1P",
+ "r32x16-o0PR","r32x16-o1PR",
+ "rNx16-xo0", "rNx16-cat",
+
+ // Arith
+ "arith-o0", "arith-o1",
+ "arith-o0R", "arith-o1R",
+ "arith-o0P", "arith-o1P",
+ "arith-o0PR", "arith-o1PR",
+ "arith-stripe", "arith-cat", "arith-ext",
+
+ // Name tokeniser
+ "tok3-rans", "tok3-arith",
+};
+
+/*----------------------------------------------------------------------
+ * Manipulation and sorting of Block Content-ID arrays and hashes
+ */
+
+typedef struct {
+ int64_t csize[COMP_MAX];
+ int64_t usize[COMP_MAX];
+} cusize_t;
+
+static int64_t total_csize(cusize_t *cu) {
+ int i;
+ int64_t tot = 0;
+ for (i = 0; i < COMP_MAX; i++)
+ tot += cu->csize[i];
+ return tot;
+}
+
+static int64_t total_usize(cusize_t *cu) {
+ int i;
+ int64_t tot = 0;
+ for (i = 0; i < COMP_MAX; i++)
+ tot += cu->usize[i];
+ return tot;
+}
+
+// cusize_t array and sorting by compressed size
+static cusize_t *sort_cusize_global; // avoids a messy extra data type
+static int sort_cusize_compar(const void *i1, const void *i2) {
+ int64_t n = sort_cusize_global->csize[*(const int *)i2] -
+ sort_cusize_global->csize[*(const int *)i1];
+ return n > 0 ? 1 : (n < 0 ? -1 : *(const int *)i1 - *(const int *)i2);
+}
+
+// Sort a cusize array by size of used method.
+// Returns cu->csize[comp] indices in descending size, as static mem
+static int *sort_cusize(cusize_t *cu) {
+ static int idx[COMP_MAX];
+ int i;
+ for (i = 0; i < COMP_MAX; i++)
+ idx[i] = i;
+ sort_cusize_global = cu;
+ qsort(idx, COMP_MAX, sizeof(*idx), sort_cusize_compar);
+
+ return idx;
+}
+
+// Hash table of cusize_t and sorting by key (content-id)
+KHASH_MAP_INIT_INT(cu, cusize_t)
+
+/* Sort by hash key. Global due to rubbish qsort API, but it's simple. */
+static khash_t(cu) *global_cu_hash = NULL;
+static int cu_compar(const void *i1, const void *i2) {
+ return kh_key(global_cu_hash, *(const int *)i1) -
+ kh_key(global_cu_hash, *(const int *)i2);
+}
+
+/*----------------------------------------------------------------------
+ * Main cram_size reporting and aggregation
+ */
+// Print the per-Content-ID size table to outfp and return the summed
+// compressed size of every block recorded in cu_size.
+//
+// outfp        Stream receiving the table.
+// verbose      Non-zero: one row per (Content-ID, compression method) pair.
+//              Zero: one row per Content-ID with a letter per method used.
+// ref_seq_blk  Content-ID to tag with "embedded_ref"; a negative value
+//              never matches.
+// cu_size      Hash of Content-ID -> per-method compressed/uncompressed sizes.
+// cid2ds       Content-ID to data-series map, queried for the last column.
+//
+// Returns the total compressed size, or -1 when cu_size or cid2ds is NULL
+// or the temporary sort array cannot be allocated.
+static off_t report_size(FILE *outfp, int verbose, int ref_seq_blk,
+ khash_t(cu) *cu_size, cram_cid2ds_t *cid2ds) {
+ if (!cu_size || !cid2ds)
+ return -1;
+
+ khiter_t k;
+ off_t tot_size = 0;
+
+ // Column header; the %.*s argument emits extra padding after "Method"
+ // only in verbose mode.
+ fprintf(outfp, "# Content_ID Uncomp.size Comp.size Ratio Method%.*s Data_series\n", verbose ? 4 : 0, " ");
+ // Collect the iterator slot of every occupied bucket so the rows can
+ // be emitted in sorted order rather than hash order.
+ int *sorted_blocks = malloc(kh_end(cu_size)*sizeof(int));
+ if (!sorted_blocks)
+ return -1;
+ int nblocks = 0;
+ for (k = kh_begin(cu_size); k != kh_end(cu_size); k++) {
+ if (!kh_exist(cu_size, k))
+ continue;
+ sorted_blocks[nblocks++] = k;
+ }
+ // cu_compar reads the keys through this file-scope pointer.
+ global_cu_hash = cu_size;
+ qsort(sorted_blocks, nblocks, sizeof(int), cu_compar);
+
+ int i;
+ for (i = 0; i < nblocks; i++) {
+ k = sorted_blocks[i];
+
+ if (verbose) {
+ // FULL output
+ // comp_idx points at sort_cusize()'s static array.
+ int *comp_idx = sort_cusize(&kh_value(cu_size, k));
+ int first_line = 1, c, j;
+ for (c = 0; c < COMP_MAX; c++) {
+ int comp = comp_idx[c];
+ // Stop at the first method with no compressed data, but
+ // always print the first entry (c == 0) so every
+ // Content-ID produces at least one row.
+ if (!kh_value(cu_size, k).csize[comp] && c)
+ break;
+
+ // Rows after the first are terminated here; the last row's
+ // newline is written after the embedded_ref check below.
+ if (!first_line)
+ fprintf(outfp, "\n");
+ first_line = 0;
+
+ // A key that is negative when viewed as int is labelled CORE.
+ if ((int)kh_key(cu_size, k) < 0)
+ fprintf(outfp, "BLOCK %8s", "CORE");
+ else
+ fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k));
+
+ fprintf(outfp, " %12"PRId64" %12"PRId64,
+ kh_value(cu_size, k).usize[comp],
+ kh_value(cu_size, k).csize[comp]);
+ // Percentage of compressed to uncompressed size; the .0001
+ // added to both terms keeps the division defined when the
+ // uncompressed size is zero.
+ double f = (100.0*(kh_value(cu_size, k).csize[comp]+.0001)) /
+ (kh_value(cu_size, k).usize[comp]+.0001);
+ if (f > 999)
+ fprintf(outfp, " >999%% %-11s", comp_method2str[comp]);
+ else
+ fprintf(outfp, " %6.2f%% %-11s",f, comp_method2str[comp]);
+
+ // Data-series codes are packed characters: three when the
+ // value exceeds 65535, otherwise two.
+ int n, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n);
+ for (j = 0; j < n; j++) {
+ int d = dsa[j];
+ if (d > 65535)
+ fprintf(outfp, " %c%c%c", d>>16, (d>>8)&0xff, d&0xff);
+ else
+ fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff);
+ }
+ }
+ } else {
+ // aggregate by compression type.
+ int64_t csize = total_csize(&kh_value(cu_size, k));
+ int64_t usize = total_usize(&kh_value(cu_size, k));
+ int *comp_idx = sort_cusize(&kh_value(cu_size, k));
+
+ // One letter per method that has compressed data, in the order
+ // returned by sort_cusize(); "." when no method has any.
+ char cstr[COMP_MAX+1] = {0};
+ int cidx = 0, c;
+ for (c = 0; c < COMP_MAX; c++) {
+ if (!kh_value(cu_size, k).csize[comp_idx[c]])
+ break;
+ cstr[cidx++] = comp_method2char[comp_idx[c]];
+ }
+ if (!*cstr) *cstr = '.';
+
+ if ((int)kh_key(cu_size, k) < 0)
+ fprintf(outfp, "BLOCK %8s", "CORE");
+ else
+ fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k));
+ fprintf(outfp, " %12"PRId64" %12"PRId64, usize, csize);
+ // Same guarded ratio as the verbose branch, on the totals.
+ double f = 100*(csize+.0001)/(usize+.0001);
+ if (f > 999)
+ fprintf(outfp, " >999%% %-7s", cstr);
+ else
+ fprintf(outfp, " %6.2f%% %-7s", f, cstr);
+
+ int n, j, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n);
+ for (j = 0; j < n; j++) {
+ int d = dsa[j];
+ if (d > 65535)
+ fprintf(outfp, " %c%c%c", d>>16, (d>>8)&0xff, d&0xff);
+ else
+ fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff);
+ }
+ }
+
+ // Tag the row whose Content-ID carries the embedded reference.
+ if ((int)kh_key(cu_size, k) >= 0 &&
+ (int)kh_key(cu_size, k) == ref_seq_blk) {
+ fprintf(outfp, " embedded_ref");
+ }
+ fprintf(outfp, "\n");
+
+ tot_size += total_csize(&kh_value(cu_size, k));
+ }
+
+ free(sorted_blocks);
+
+ return tot_size;
+}
+
+/*
+ * Main processing loop.
+ *
+ * Reads every container and slice from the CRAM stream behind `in`, adds
+ * each block's compressed and uncompressed size to a per-Content-ID,
+ * per-compression-method tally, prints the table through report_size()
+ * and then prints file-level totals.
+ *
+ * hf_in      hFILE underlying `in`; only used with htell() to learn how
+ *            many bytes were consumed.
+ * in         Open input file; must be CRAM.
+ * h          Header; not referenced in the body.
+ * outfp      Stream receiving the report.
+ * verbose    Forwarded to report_size().
+ * encodings  Non-zero to also print each container's encoding description.
+ *
+ * Returns 0 on success and -1 on failure.  On failure the statistics
+ * gathered so far are still reported before the error message.
+ */
+static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp,
+                     int verbose, int encodings) {
+    cram_fd *in_c;
+    cram_container *c = NULL;
+    cram_block *blk = NULL;
+    cram_block_slice_hdr *shdr = NULL;
+    cram_block_compression_hdr *chdr = NULL;
+    khiter_t k;
+    int ret;
+    cram_cid2ds_t *cid2ds = NULL;
+    khash_t(cu) *cu_size = kh_init(cu);
+    int ref_seq_blk_used = -1;
+    int64_t nseqs = 0, nbases = 0, ncont = 0, nslice = 0;
+
+    // Nothing else has been allocated yet, so fail directly.
+    if (!cu_size) {
+        print_error("cram_size", "Failed to allocate memory");
+        return -1;
+    }
+
+    if (!in->is_cram) {
+        print_error("cram_size", "Input is not a CRAM file");
+        goto err;
+    }
+    in_c = in->fp.cram; // low level htslib abuse?
+    while ((c = cram_read_container(in_c))) {
+        if (cram_container_is_empty(in_c)) {
+            // Container compression header: read and discard.
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+            cram_free_container(c);
+            c = NULL;
+            continue;
+        }
+
+        nseqs += cram_container_get_num_records(c);
+        nbases += cram_container_get_num_bases(c);
+
+        // Container compression header
+        int32_t num_slices;
+        if (!(blk = cram_read_block(in_c)))
+            goto err;
+
+        // Decode compression header; a failed decode must not reach the
+        // calls below, which use the header.
+        if (!(chdr = cram_decode_compression_header(in_c, blk)))
+            goto err;
+
+        if (encodings) {
+            kstring_t ks = KS_INITIALIZE;
+            if (cram_describe_encodings(chdr, &ks) < 0) {
+                ks_free(&ks);
+                goto err;
+            }
+
+            fprintf(outfp, "Container encodings\n%s\n", ks_str(&ks));
+
+            ks_free(&ks);
+        }
+
+        // Update through a temporary so that a failed update does not
+        // overwrite (and lose) the map built from earlier containers.
+        cram_cid2ds_t *c2d = cram_update_cid2ds_map(chdr, cid2ds);
+        if (!c2d)
+            goto err;
+        cid2ds = c2d;
+
+        cram_free_block(blk);
+        blk = NULL;
+
+        cram_free_compression_header(chdr);
+        chdr = NULL;
+
+        // Container num_blocks can be invalid, due to a bug.
+        // We iterate in slice context instead.
+        (void)cram_container_get_landmarks(c, &num_slices);
+        ncont++;
+        nslice += num_slices;
+
+        int i, j;
+        for (i = 0; i < num_slices; i++) {
+            // Slice header
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            if (!(shdr = cram_decode_slice_header(in_c, blk)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+
+            int ref_seq_blk = cram_slice_hdr_get_embed_ref_id(shdr);
+            int num_blocks = cram_slice_hdr_get_num_blocks(shdr);
+
+            // Embedded reference.  Check it's consistent (if used this is
+            // an almost guaranteed certainty, so we take the easy route).
+            if (ref_seq_blk >= 0) {
+                if (ref_seq_blk_used == -1)
+                    ref_seq_blk_used = ref_seq_blk;
+                else if (ref_seq_blk_used != ref_seq_blk)
+                    fprintf(stderr, "Embedded reference is not consistently using the same Content-Id.\n"
+                            "Reported figures for reference will be invalid.\n");
+            }
+
+            // Slice data blocks
+            for (j = 0; j < num_blocks; j++) {
+                // Only the block's sizes and method are needed; the
+                // payload itself is discarded.
+                if (!(blk = cram_read_block(in_c)))
+                    goto err;
+
+                int32_t csize = cram_block_get_comp_size(blk);
+                int32_t usize = cram_block_get_uncomp_size(blk);
+                int cid = cram_block_get_content_id(blk);
+                enum cram_block_method method = cram_block_get_method(blk);
+
+                // Expand comp to the internal sub-formats, eg
+                // rANS order-0/1, PACK+RLE, etc.
+                cram_method_details *cm;
+                cm = cram_expand_method(cram_block_get_data(blk),
+                                        cram_block_get_comp_size(blk),
+                                        method);
+                if (!cm)
+                    goto err;
+                enum comp_expanded comp = comp_method2expanded(cm);
+                free(cm);
+
+                // comp indexes csize[]/usize[]; reject anything outside
+                // the table rather than writing out of bounds.
+                if ((int)comp < 0 || (int)comp >= COMP_MAX)
+                    goto err;
+
+                k = kh_put(cu, cu_size, cid, &ret);
+                if (ret < 0)
+                    goto err;
+                if (ret == 0) {
+                    // Content-ID seen before: accumulate.
+                    kh_value(cu_size, k).csize[comp] += csize;
+                    kh_value(cu_size, k).usize[comp] += usize;
+                } else {
+                    // New Content-ID: start from an all-zero tally.
+                    memset(&kh_value(cu_size, k), 0, sizeof(cusize_t));
+                    kh_value(cu_size, k).csize[comp] = csize;
+                    kh_value(cu_size, k).usize[comp] = usize;
+                }
+
+                cram_free_block(blk);
+                blk = NULL;
+            }
+            cram_free_slice_header(shdr);
+            shdr = NULL;
+        }
+
+        cram_free_container(c);
+        c = NULL;
+    }
+
+    off_t tot_size = report_size(outfp, verbose, ref_seq_blk_used,
+                                 cu_size, cid2ds);
+    if (tot_size < 0)
+        goto err;
+
+    kh_destroy(cu, cu_size);
+    cram_cid2ds_free(cid2ds);
+
+    off_t end = htell(hf_in);
+
+    // off_t is cast so the arguments match the PRId64 conversions.
+    fprintf(outfp, "\n");
+    fprintf(outfp, "Number of containers %18"PRId64"\n", ncont);
+    fprintf(outfp, "Number of slices %18"PRId64"\n", nslice);
+    fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs);
+    fprintf(outfp, "Number of bases %18"PRId64"\n", nbases);
+    fprintf(outfp, "Total file size %18"PRId64"\n", (int64_t)end);
+    fprintf(outfp, "Format overhead size %18"PRId64"\n",
+            (int64_t)(end - tot_size));
+
+    return 0;
+
+ err:
+    // Report anyway so we can get stats on partial files, but be
+    // sure to error too.
+    report_size(outfp, verbose, ref_seq_blk_used, cu_size, cid2ds);
+
+    print_error("cram_size", "Failed in decoding CRAM file");
+    if (chdr)
+        cram_free_compression_header(chdr);
+    if (blk)
+        cram_free_block(blk);
+    if (shdr)
+        cram_free_slice_header(shdr);
+    if (c)
+        cram_free_container(c);
+    if (cid2ds)
+        cram_cid2ds_free(cid2ds);
+    kh_destroy(cu, cu_size);
+
+    return -1;
+}
+
+/*
+ * main() for cram_size.
+ *
+ * Options: -o FILE (output file, default stdout), -v (verbose table),
+ * -e (also print container encodings), plus the global options enabled
+ * in lopts.  The input defaults to "-" when no file name is given.
+ *
+ * Returns 0 on success or after printing usage, 1 on any failure.
+ * Every exit goes through the single cleanup section at the bottom so
+ * the input, header and output stream are released on all paths.
+ */
+int main_cram_size(int argc, char *argv[]) {
+    int c, usage = 0, verbose = 0, encodings = 0;
+    int status = 1;
+    const char *fn;
+    sam_hdr_t *h = NULL;
+    hFILE *hf_in = NULL;
+    samFile *in = NULL;
+    sam_global_args ga;
+    FILE *outfp = stdout;
+
+    static const struct option lopts[] = {
+        {"output", required_argument, NULL, 'o'},
+        {"verbose", no_argument, NULL, 'v'},
+        {"encodings", no_argument, NULL, 'e'},
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '-'),
+        { NULL, 0, NULL, 0 }
+    };
+
+    sam_global_args_init(&ga);
+
+    while ((c = getopt_long(argc, argv, "vo:e", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'o':
+            // A repeated -o must not leak the stream opened earlier.
+            if (outfp != stdout)
+                fclose(outfp);
+            if (!(outfp = fopen(optarg, "w"))) {
+                perror(optarg);
+                goto out;
+            }
+            break;
+
+        case 'v':
+            verbose++;
+            break;
+
+        case 'e':
+            encodings++;
+            break;
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+            /* else fall-through */
+        case '?': usage=1; break;
+        }
+    }
+
+    if ((optind == argc && isatty(0)) || usage) {
+        printf("Usage: samtools cram_size [-ve] [-o out.size] [in.cram]\n");
+        status = 0;
+        goto out;
+    }
+
+    fn = optind < argc ? argv[optind] : "-";
+
+    // We want access to in->fp.cram->fp, but this is an opaque struct so we
+    // can't get that.  However we opened with hopen and then reopen as
+    // CRAM with hts_hopen, which will swallow the initial hFILE and take
+    // ownership of it.  Hence we now know in->fp.cram->fp.
+    if (!(hf_in = hopen(fn, "r"))) {
+        print_error_errno("cram_size", "failed to open file '%s'", fn);
+        goto out;
+    }
+    if (!(in = hts_hopen(hf_in, fn, "r"))) {
+        print_error_errno("cram_size", "failed to open file '%s'", fn);
+        goto out;
+    }
+
+    if (!(h = sam_hdr_read(in)))
+        goto out;
+
+    status = cram_size(hf_in, in, h, outfp, verbose, encodings) ? 1 : 0;
+
+ out:
+    if (h)
+        sam_hdr_destroy(h);
+    if (in)
+        sam_close(in);              // also closes hf_in
+    else if (hf_in)
+        hclose_abruptly(hf_in);     // hts_hopen failed; hFILE is still ours
+    // outfp is NULL when fopen() for -o failed.
+    if (outfp && outfp != stdout && fclose(outfp) != 0) {
+        print_error_errno("cram_size", "failed to close output file");
+        status = 1;
+    }
+
+    return status;
+}
--- /dev/null
+#include "samtools.pysam.h"
+
+/* cram_size.c -- produces summary of the size of each cram data-series
+
+ Copyright (C) 2023 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+// TODO: add range query. Eg the ability to look at size for "*" only
+// (unmapped), or in a specific region such as a centromere.
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <strings.h>
+
+#include "htslib/bgzf.h"
+#include "htslib/sam.h"
+#include "htslib/cram.h"
+#include "htslib/kstring.h"
+#include "htslib/khash.h"
+#include "samtools.h"
+#include "sam_opts.h"
+#include "htslib/hfile.h"
+
+/*----------------------------------------------------------------------
+ * Compression method handling
+ */
+
+// A numeric version of the cram_method_details struct.
+// We expand the myriad of struct field combinations into a single
+// enumerated type so we can index and accumulate statistics for
+// purposes of reporting.
+//
+// These expanded numeric values have no definition within CRAM itself
+// and never occur within the file format.
+//
+// The enumerator order is significant: the values index the
+// comp_method2char and comp_method2str tables and the csize/usize arrays
+// in cusize_t, and comp_method2expanded() computes several of them by
+// adding offsets to a base enumerator.
+enum comp_expanded {
+ //----
+ // Copies from htslib cram_block_method enum
+ COMP_RAW = CRAM_COMP_RAW,
+ COMP_GZIP = CRAM_COMP_GZIP,
+ COMP_BZIP2 = CRAM_COMP_BZIP2,
+ COMP_LZMA = CRAM_COMP_LZMA,
+ COMP_RANS8 = CRAM_COMP_RANS4x8,
+ COMP_RANS16 = CRAM_COMP_RANSNx16,
+ COMP_ARITH = CRAM_COMP_ARITH,
+ COMP_FQZ = CRAM_COMP_FQZ,
+ COMP_TOK3 = CRAM_COMP_TOK3,
+
+ //----
+ // Localised variants.
+
+ // Gzip
+ COMP_GZIP_1,
+ COMP_GZIP_9,
+
+ // Bzip2; levels 1-9 are consecutive so a level can be added to
+ // COMP_BZIP2_1 as an offset.
+ COMP_BZIP2_1,
+ COMP_BZIP2_2,
+ COMP_BZIP2_3,
+ COMP_BZIP2_4,
+ COMP_BZIP2_5,
+ COMP_BZIP2_6,
+ COMP_BZIP2_7,
+ COMP_BZIP2_8,
+ COMP_BZIP2_9,
+
+ // rans 4x8
+ COMP_RANS4x8_O0,
+ COMP_RANS4x8_O1,
+
+ // rans Nx16. Note order here is to enable selection via bit-fields
+ // bit 0: O0/O1
+ // bit 1: RLE
+ // bit 2: PACK
+ // bit 3: 32x16
+ COMP_RANS4x16_O0,
+ COMP_RANS4x16_O1,
+ COMP_RANS4x16_O0R, // +RLE
+ COMP_RANS4x16_O1R,
+ COMP_RANS4x16_O0P, // +PACK
+ COMP_RANS4x16_O1P,
+ COMP_RANS4x16_O0PR, // +PACK+RLE
+ COMP_RANS4x16_O1PR,
+ COMP_RANS32x16_O0, // SIMD variants
+ COMP_RANS32x16_O1,
+ COMP_RANS32x16_O0R, // +RLE
+ COMP_RANS32x16_O1R,
+ COMP_RANS32x16_O0P, // +PACK
+ COMP_RANS32x16_O1P,
+ COMP_RANS32x16_O0PR, // +PACK+RLE
+ COMP_RANS32x16_O1PR,
+ COMP_RANSNx16_STRIPE,
+ COMP_RANSNx16_CAT,
+
+ // Arith; the first eight follow the same bit layout as rans Nx16
+ // (bit 0: O0/O1, bit 1: RLE, bit 2: PACK).
+ COMP_ARITH_O0,
+ COMP_ARITH_O1,
+ COMP_ARITH_O0R, // +RLE
+ COMP_ARITH_O1R,
+ COMP_ARITH_O0P, // +PACK
+ COMP_ARITH_O1P,
+ COMP_ARITH_O0PR, // +PACK+RLE
+ COMP_ARITH_O1PR,
+ COMP_ARITH_STRIPE,
+ COMP_ARITH_CAT, // no entropy encoder
+ COMP_ARITH_EXT, // external entropy encode
+
+ // Name tokeniser
+ COMP_TOK3_RANS,
+ COMP_TOK3_ARITH,
+
+ // To mark maximum size
+ COMP_MAX,
+};
+
+// Map a decoded cram_method_details onto the flat comp_expanded index
+// used for the statistics tables.
+//
+// Methods without a specialised sub-variant are returned as their plain
+// cram_block_method value.
+static enum comp_expanded comp_method2expanded(cram_method_details *cm) {
+    switch (cm->method) {
+    case CRAM_COMP_GZIP:
+        // Only the minimum and maximum levels get their own slot.
+        if (cm->level == 1)
+            return COMP_GZIP_1;
+        if (cm->level == 9)
+            return COMP_GZIP_9;
+        return COMP_GZIP;
+
+    case CRAM_COMP_BZIP2:
+        // Levels 1..9 map onto nine consecutive enumerators.
+        if (cm->level < 1 || cm->level > 9)
+            return COMP_BZIP2;
+        return COMP_BZIP2_1 + cm->level-1;
+
+    case CRAM_COMP_RANS4x8:
+        if (cm->order)
+            return COMP_RANS4x8_O1;
+        return COMP_RANS4x8_O0;
+
+    case CRAM_COMP_RANSNx16: {
+        // Stripe and cat take priority; otherwise the slot is the base
+        // plus order (1), RLE (2), PACK (4) and 32-way (8) offsets.
+        if (cm->stripe)
+            return COMP_RANSNx16_STRIPE;
+        if (cm->cat)
+            return COMP_RANSNx16_CAT;
+
+        int slot = COMP_RANS4x16_O0;
+        slot += 1*cm->order;
+        slot += 2*cm->rle;
+        slot += 4*cm->pack;
+        slot += 8*(cm->Nway==32);
+        return slot;
+    }
+
+    case CRAM_COMP_ARITH: {
+        // Stripe, cat and ext take priority; otherwise the slot is the
+        // base plus order (1), RLE (2) and PACK (4) offsets.
+        if (cm->stripe)
+            return COMP_ARITH_STRIPE;
+        if (cm->cat)
+            return COMP_ARITH_CAT;
+        if (cm->ext)
+            return COMP_ARITH_EXT;
+
+        int slot = COMP_ARITH_O0;
+        slot += 1*cm->order;
+        slot += 2*cm->rle;
+        slot += 4*cm->pack;
+        return slot;
+    }
+
+    case CRAM_COMP_TOK3:
+        if (cm->level < 10)
+            return COMP_TOK3_RANS;
+        return COMP_TOK3_ARITH;
+
+    default:
+        // Any unspecialised method
+        return (enum comp_expanded)cm->method;
+    }
+}
+
+// Short form of cram_block_method_int type: one character per
+// comp_expanded value, in enum order.  The table is only ever indexed,
+// never modified, hence const.
+// NOTE(review): the initialiser appears to supply exactly COMP_MAX
+// characters (assuming the htslib CRAM_COMP_* values run 0..8), in which
+// case no terminating NUL is stored; index it, do not treat it as a
+// C string.
+static const char comp_method2char[COMP_MAX] =
+    ".gblr0afn"          // standard CRAM methods
+    "_G"                 // gzip
+    "bbbbbbbbB"          // bzip2
+    "rR"                 // rans4x8
+    "010101014545454582" // ransNx16
+    "aAaAaAaAaaa"        // arith
+    "nN";                // tok3
+
+// Long form of cram_block_method_int type: one name per comp_expanded
+// value, in enum order.  Read-only lookup table, hence const pointers to
+// const strings.
+static const char *const comp_method2str[COMP_MAX] = {
+    // Standard CRAM methods
+    "raw", "gzip", "bzip2", "lzma", "r4x8", "rNx16",
+    "arith", "fqzcomp", "tok3",
+
+    // custom gzip
+    "gzip-min", "gzip-max",
+
+    // custom bzip2
+    "bzip2-1", "bzip2-2", "bzip2-3", "bzip2-4", "bzip2-5",
+    "bzip2-6", "bzip2-7", "bzip2-8", "bzip2-9",
+
+    // rANS 4x8
+    "r4x8-o0", "r4x8-o1",
+
+    // rANS 4x16
+    "r4x16-o0", "r4x16-o1",
+
+    "r4x16-o0R", "r4x16-o1R",
+    "r4x16-o0P", "r4x16-o1P",
+    "r4x16-o0PR", "r4x16-o1PR",
+    "r32x16-o0", "r32x16-o1",
+    "r32x16-o0R", "r32x16-o1R",
+    "r32x16-o0P", "r32x16-o1P",
+    "r32x16-o0PR","r32x16-o1PR",
+    "rNx16-xo0", "rNx16-cat",
+
+    // Arith
+    "arith-o0", "arith-o1",
+    "arith-o0R", "arith-o1R",
+    "arith-o0P", "arith-o1P",
+    "arith-o0PR", "arith-o1PR",
+    "arith-stripe", "arith-cat", "arith-ext",
+
+    // Name tokeniser
+    "tok3-rans", "tok3-arith",
+};
+
+/*----------------------------------------------------------------------
+ * Manipulation and sorting of Block Content-ID arrays and hashes
+ */
+
+// Per-Content-ID size tally: compressed and uncompressed byte counts,
+// each indexed by enum comp_expanded.
+typedef struct {
+ int64_t csize[COMP_MAX];
+ int64_t usize[COMP_MAX];
+} cusize_t;
+
+// Sum of the compressed sizes recorded across all compression methods.
+static int64_t total_csize(cusize_t *cu) {
+    int64_t sum = 0;
+    const int64_t *cur = cu->csize;
+    const int64_t *stop = cur + COMP_MAX;
+
+    while (cur != stop)
+        sum += *cur++;
+
+    return sum;
+}
+
+// Sum of the uncompressed sizes recorded across all compression methods.
+static int64_t total_usize(cusize_t *cu) {
+    int64_t sum = 0;
+    const int64_t *cur = cu->usize;
+    const int64_t *stop = cur + COMP_MAX;
+
+    while (cur != stop)
+        sum += *cur++;
+
+    return sum;
+}
+
+// Comparator state for sort_cusize(): qsort() offers no user-data
+// argument, so the tally being sorted is passed through this pointer.
+static cusize_t *sort_cusize_global;
+
+// Orders method indices by descending compressed size; equal sizes fall
+// back to ascending index.
+static int sort_cusize_compar(const void *i1, const void *i2) {
+    const int first = *(const int *)i1;
+    const int second = *(const int *)i2;
+    const int64_t delta = sort_cusize_global->csize[second] -
+                          sort_cusize_global->csize[first];
+
+    if (delta > 0)
+        return 1;
+    if (delta < 0)
+        return -1;
+    return first - second;
+}
+
+// Rank the compression methods of one tally by compressed size.
+// Returns the cu->csize[] indices, largest first, in a static array that
+// is overwritten by the next call.
+static int *sort_cusize(cusize_t *cu) {
+    static int idx[COMP_MAX];
+    int n = 0;
+
+    // Start from the identity permutation.
+    while (n < COMP_MAX) {
+        idx[n] = n;
+        n++;
+    }
+
+    sort_cusize_global = cu;
+    qsort(idx, sizeof(idx)/sizeof(idx[0]), sizeof(idx[0]),
+          sort_cusize_compar);
+
+    return idx;
+}
+
+// Hash table of cusize_t and sorting by key (content-id)
+KHASH_MAP_INIT_INT(cu, cusize_t)
+
+/* Sort by hash key.  Global due to rubbish qsort API, but it's simple. */
+static khash_t(cu) *global_cu_hash = NULL;
+
+// qsort comparator over hash iterator slots: orders them by the key
+// stored in global_cu_hash, compared as a signed int.
+// The keys are compared rather than subtracted: the hash keys are
+// unsigned 32-bit values, so a subtraction can wrap and hand qsort the
+// wrong sign when the two keys are far apart.
+static int cu_compar(const void *i1, const void *i2) {
+    int key1 = (int)kh_key(global_cu_hash, *(const int *)i1);
+    int key2 = (int)kh_key(global_cu_hash, *(const int *)i2);
+
+    return (key1 > key2) - (key1 < key2);
+}
+
+/*----------------------------------------------------------------------
+ * Main cram_size reporting and aggregation
+ */
+// Print the per-Content-ID size table to outfp and return the summed
+// compressed size of every block recorded in cu_size.
+//
+// outfp        Stream receiving the table.
+// verbose      Non-zero: one row per (Content-ID, compression method) pair.
+//              Zero: one row per Content-ID with a letter per method used.
+// ref_seq_blk  Content-ID to tag with "embedded_ref"; a negative value
+//              never matches.
+// cu_size      Hash of Content-ID -> per-method compressed/uncompressed sizes.
+// cid2ds       Content-ID to data-series map, queried for the last column.
+//
+// Returns the total compressed size, or -1 when cu_size or cid2ds is NULL
+// or the temporary sort array cannot be allocated.
+static off_t report_size(FILE *outfp, int verbose, int ref_seq_blk,
+ khash_t(cu) *cu_size, cram_cid2ds_t *cid2ds) {
+ if (!cu_size || !cid2ds)
+ return -1;
+
+ khiter_t k;
+ off_t tot_size = 0;
+
+ // Column header; the %.*s argument emits extra padding after "Method"
+ // only in verbose mode.
+ fprintf(outfp, "# Content_ID Uncomp.size Comp.size Ratio Method%.*s Data_series\n", verbose ? 4 : 0, " ");
+ // Collect the iterator slot of every occupied bucket so the rows can
+ // be emitted in key order rather than hash order.
+ int *sorted_blocks = malloc(kh_end(cu_size)*sizeof(int));
+ if (!sorted_blocks)
+ return -1;
+ int nblocks = 0;
+ for (k = kh_begin(cu_size); k != kh_end(cu_size); k++) {
+ if (!kh_exist(cu_size, k))
+ continue;
+ sorted_blocks[nblocks++] = k;
+ }
+ // cu_compar reads the keys through this file-scope pointer.
+ global_cu_hash = cu_size;
+ qsort(sorted_blocks, nblocks, sizeof(int), cu_compar);
+
+ int i;
+ for (i = 0; i < nblocks; i++) {
+ k = sorted_blocks[i];
+
+ if (verbose) {
+ // FULL output
+ // comp_idx lists method indices by descending compressed size and
+ // points at sort_cusize()'s static array.
+ int *comp_idx = sort_cusize(&kh_value(cu_size, k));
+ int first_line = 1, c, j;
+ for (c = 0; c < COMP_MAX; c++) {
+ int comp = comp_idx[c];
+ // Stop at the first method with no compressed data, but
+ // always print the first entry (c == 0) so every
+ // Content-ID produces at least one row.
+ if (!kh_value(cu_size, k).csize[comp] && c)
+ break;
+
+ // Rows after the first are terminated here; the last row's
+ // newline is written after the embedded_ref check below.
+ if (!first_line)
+ fprintf(outfp, "\n");
+ first_line = 0;
+
+ // A key that is negative when viewed as int is labelled CORE.
+ if ((int)kh_key(cu_size, k) < 0)
+ fprintf(outfp, "BLOCK %8s", "CORE");
+ else
+ fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k));
+
+ fprintf(outfp, " %12"PRId64" %12"PRId64,
+ kh_value(cu_size, k).usize[comp],
+ kh_value(cu_size, k).csize[comp]);
+ // Percentage of compressed to uncompressed size; the .0001
+ // added to both terms keeps the division defined when the
+ // uncompressed size is zero.
+ double f = (100.0*(kh_value(cu_size, k).csize[comp]+.0001)) /
+ (kh_value(cu_size, k).usize[comp]+.0001);
+ if (f > 999)
+ fprintf(outfp, " >999%% %-11s", comp_method2str[comp]);
+ else
+ fprintf(outfp, " %6.2f%% %-11s",f, comp_method2str[comp]);
+
+ // Data-series codes are packed characters: three when the
+ // value exceeds 65535, otherwise two.
+ int n, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n);
+ for (j = 0; j < n; j++) {
+ int d = dsa[j];
+ if (d > 65535)
+ fprintf(outfp, " %c%c%c", d>>16, (d>>8)&0xff, d&0xff);
+ else
+ fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff);
+ }
+ }
+ } else {
+ // aggregate by compression type.
+ int64_t csize = total_csize(&kh_value(cu_size, k));
+ int64_t usize = total_usize(&kh_value(cu_size, k));
+ int *comp_idx = sort_cusize(&kh_value(cu_size, k));
+
+ // One letter per method that has compressed data, largest
+ // first; "." when no method has any.
+ char cstr[COMP_MAX+1] = {0};
+ int cidx = 0, c;
+ for (c = 0; c < COMP_MAX; c++) {
+ if (!kh_value(cu_size, k).csize[comp_idx[c]])
+ break;
+ cstr[cidx++] = comp_method2char[comp_idx[c]];
+ }
+ if (!*cstr) *cstr = '.';
+
+ if ((int)kh_key(cu_size, k) < 0)
+ fprintf(outfp, "BLOCK %8s", "CORE");
+ else
+ fprintf(outfp, "BLOCK %8d", kh_key(cu_size, k));
+ fprintf(outfp, " %12"PRId64" %12"PRId64, usize, csize);
+ // Same guarded ratio as the verbose branch, on the totals.
+ double f = 100*(csize+.0001)/(usize+.0001);
+ if (f > 999)
+ fprintf(outfp, " >999%% %-7s", cstr);
+ else
+ fprintf(outfp, " %6.2f%% %-7s", f, cstr);
+
+ int n, j, *dsa = cram_cid2ds_query(cid2ds, kh_key(cu_size, k), &n);
+ for (j = 0; j < n; j++) {
+ int d = dsa[j];
+ if (d > 65535)
+ fprintf(outfp, " %c%c%c", d>>16, (d>>8)&0xff, d&0xff);
+ else
+ fprintf(outfp, " %c%c", (d>>8)&0xff, d&0xff);
+ }
+ }
+
+ // Tag the row whose Content-ID carries the embedded reference.
+ if ((int)kh_key(cu_size, k) >= 0 &&
+ (int)kh_key(cu_size, k) == ref_seq_blk) {
+ fprintf(outfp, " embedded_ref");
+ }
+ fprintf(outfp, "\n");
+
+ tot_size += total_csize(&kh_value(cu_size, k));
+ }
+
+ free(sorted_blocks);
+
+ return tot_size;
+}
+
+/*
+ * Main processing loop.
+ *
+ * Reads every container and slice from the CRAM stream behind `in`, adds
+ * each block's compressed and uncompressed size to a per-Content-ID,
+ * per-compression-method tally, prints the table through report_size()
+ * and then prints file-level totals.
+ *
+ * hf_in      hFILE underlying `in`; only used with htell() to learn how
+ *            many bytes were consumed.
+ * in         Open input file; must be CRAM.
+ * h          Header; not referenced in the body.
+ * outfp      Stream receiving the report.
+ * verbose    Forwarded to report_size().
+ * encodings  Non-zero to also print each container's encoding description.
+ *
+ * Returns 0 on success and -1 on failure.  On failure the statistics
+ * gathered so far are still reported before the error message.
+ */
+static int cram_size(hFILE *hf_in, samFile *in, sam_hdr_t *h, FILE *outfp,
+                     int verbose, int encodings) {
+    cram_fd *in_c;
+    cram_container *c = NULL;
+    cram_block *blk = NULL;
+    cram_block_slice_hdr *shdr = NULL;
+    cram_block_compression_hdr *chdr = NULL;
+    khiter_t k;
+    int ret;
+    cram_cid2ds_t *cid2ds = NULL;
+    khash_t(cu) *cu_size = kh_init(cu);
+    int ref_seq_blk_used = -1;
+    int64_t nseqs = 0, nbases = 0, ncont = 0, nslice = 0;
+
+    // Nothing else has been allocated yet, so fail directly.
+    if (!cu_size) {
+        print_error("cram_size", "Failed to allocate memory");
+        return -1;
+    }
+
+    if (!in->is_cram) {
+        print_error("cram_size", "Input is not a CRAM file");
+        goto err;
+    }
+    in_c = in->fp.cram; // low level htslib abuse?
+    while ((c = cram_read_container(in_c))) {
+        if (cram_container_is_empty(in_c)) {
+            // Container compression header: read and discard.
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+            cram_free_container(c);
+            c = NULL;
+            continue;
+        }
+
+        nseqs += cram_container_get_num_records(c);
+        nbases += cram_container_get_num_bases(c);
+
+        // Container compression header
+        int32_t num_slices;
+        if (!(blk = cram_read_block(in_c)))
+            goto err;
+
+        // Decode compression header; a failed decode must not reach the
+        // calls below, which use the header.
+        if (!(chdr = cram_decode_compression_header(in_c, blk)))
+            goto err;
+
+        if (encodings) {
+            kstring_t ks = KS_INITIALIZE;
+            if (cram_describe_encodings(chdr, &ks) < 0) {
+                ks_free(&ks);
+                goto err;
+            }
+
+            fprintf(outfp, "Container encodings\n%s\n", ks_str(&ks));
+
+            ks_free(&ks);
+        }
+
+        // Update through a temporary so that a failed update does not
+        // overwrite (and lose) the map built from earlier containers.
+        cram_cid2ds_t *c2d = cram_update_cid2ds_map(chdr, cid2ds);
+        if (!c2d)
+            goto err;
+        cid2ds = c2d;
+
+        cram_free_block(blk);
+        blk = NULL;
+
+        cram_free_compression_header(chdr);
+        chdr = NULL;
+
+        // Container num_blocks can be invalid, due to a bug.
+        // We iterate in slice context instead.
+        (void)cram_container_get_landmarks(c, &num_slices);
+        ncont++;
+        nslice += num_slices;
+
+        int i, j;
+        for (i = 0; i < num_slices; i++) {
+            // Slice header
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            if (!(shdr = cram_decode_slice_header(in_c, blk)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+
+            int ref_seq_blk = cram_slice_hdr_get_embed_ref_id(shdr);
+            int num_blocks = cram_slice_hdr_get_num_blocks(shdr);
+
+            // Embedded reference.  Check it's consistent (if used this is
+            // an almost guaranteed certainty, so we take the easy route).
+            if (ref_seq_blk >= 0) {
+                if (ref_seq_blk_used == -1)
+                    ref_seq_blk_used = ref_seq_blk;
+                else if (ref_seq_blk_used != ref_seq_blk)
+                    fprintf(samtools_stderr, "Embedded reference is not consistently using the same Content-Id.\n"
+                            "Reported figures for reference will be invalid.\n");
+            }
+
+            // Slice data blocks
+            for (j = 0; j < num_blocks; j++) {
+                // Only the block's sizes and method are needed; the
+                // payload itself is discarded.
+                if (!(blk = cram_read_block(in_c)))
+                    goto err;
+
+                int32_t csize = cram_block_get_comp_size(blk);
+                int32_t usize = cram_block_get_uncomp_size(blk);
+                int cid = cram_block_get_content_id(blk);
+                enum cram_block_method method = cram_block_get_method(blk);
+
+                // Expand comp to the internal sub-formats, eg
+                // rANS order-0/1, PACK+RLE, etc.
+                cram_method_details *cm;
+                cm = cram_expand_method(cram_block_get_data(blk),
+                                        cram_block_get_comp_size(blk),
+                                        method);
+                if (!cm)
+                    goto err;
+                enum comp_expanded comp = comp_method2expanded(cm);
+                free(cm);
+
+                // comp indexes csize[]/usize[]; reject anything outside
+                // the table rather than writing out of bounds.
+                if ((int)comp < 0 || (int)comp >= COMP_MAX)
+                    goto err;
+
+                k = kh_put(cu, cu_size, cid, &ret);
+                if (ret < 0)
+                    goto err;
+                if (ret == 0) {
+                    // Content-ID seen before: accumulate.
+                    kh_value(cu_size, k).csize[comp] += csize;
+                    kh_value(cu_size, k).usize[comp] += usize;
+                } else {
+                    // New Content-ID: start from an all-zero tally.
+                    memset(&kh_value(cu_size, k), 0, sizeof(cusize_t));
+                    kh_value(cu_size, k).csize[comp] = csize;
+                    kh_value(cu_size, k).usize[comp] = usize;
+                }
+
+                cram_free_block(blk);
+                blk = NULL;
+            }
+            cram_free_slice_header(shdr);
+            shdr = NULL;
+        }
+
+        cram_free_container(c);
+        c = NULL;
+    }
+
+    off_t tot_size = report_size(outfp, verbose, ref_seq_blk_used,
+                                 cu_size, cid2ds);
+    if (tot_size < 0)
+        goto err;
+
+    kh_destroy(cu, cu_size);
+    cram_cid2ds_free(cid2ds);
+
+    off_t end = htell(hf_in);
+
+    // off_t is cast so the arguments match the PRId64 conversions.
+    fprintf(outfp, "\n");
+    fprintf(outfp, "Number of containers %18"PRId64"\n", ncont);
+    fprintf(outfp, "Number of slices %18"PRId64"\n", nslice);
+    fprintf(outfp, "Number of sequences %18"PRId64"\n", nseqs);
+    fprintf(outfp, "Number of bases %18"PRId64"\n", nbases);
+    fprintf(outfp, "Total file size %18"PRId64"\n", (int64_t)end);
+    fprintf(outfp, "Format overhead size %18"PRId64"\n",
+            (int64_t)(end - tot_size));
+
+    return 0;
+
+ err:
+    // Report anyway so we can get stats on partial files, but be
+    // sure to error too.
+    report_size(outfp, verbose, ref_seq_blk_used, cu_size, cid2ds);
+
+    print_error("cram_size", "Failed in decoding CRAM file");
+    if (chdr)
+        cram_free_compression_header(chdr);
+    if (blk)
+        cram_free_block(blk);
+    if (shdr)
+        cram_free_slice_header(shdr);
+    if (c)
+        cram_free_container(c);
+    if (cid2ds)
+        cram_cid2ds_free(cid2ds);
+    kh_destroy(cu, cu_size);
+
+    return -1;
+}
+
+/*
+ * main() for cram_size.
+ *
+ * Options: -o FILE (output file, default samtools_stdout), -v (verbose
+ * table), -e (also print container encodings), plus the global options
+ * enabled in lopts.  The input defaults to "-" when no file name is given.
+ *
+ * Returns 0 on success or after printing usage, 1 on any failure.
+ * Every exit goes through the single cleanup section at the bottom so
+ * the input, header and output stream are released on all paths.
+ */
+int main_cram_size(int argc, char *argv[]) {
+    int c, usage = 0, verbose = 0, encodings = 0;
+    int status = 1;
+    const char *fn;
+    sam_hdr_t *h = NULL;
+    hFILE *hf_in = NULL;
+    samFile *in = NULL;
+    sam_global_args ga;
+    FILE *outfp = samtools_stdout;
+
+    static const struct option lopts[] = {
+        {"output", required_argument, NULL, 'o'},
+        {"verbose", no_argument, NULL, 'v'},
+        {"encodings", no_argument, NULL, 'e'},
+        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '-'),
+        { NULL, 0, NULL, 0 }
+    };
+
+    sam_global_args_init(&ga);
+
+    while ((c = getopt_long(argc, argv, "vo:e", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'o':
+            // A repeated -o must not leak the stream opened earlier.
+            if (outfp != samtools_stdout)
+                fclose(outfp);
+            if (!(outfp = fopen(optarg, "w"))) {
+                perror(optarg);
+                goto out;
+            }
+            break;
+
+        case 'v':
+            verbose++;
+            break;
+
+        case 'e':
+            encodings++;
+            break;
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+            /* else fall-through */
+        case '?': usage=1; break;
+        }
+    }
+
+    if ((optind == argc && isatty(0)) || usage) {
+        fprintf(samtools_stdout, "Usage: samtools cram_size [-ve] [-o out.size] [in.cram]\n");
+        status = 0;
+        goto out;
+    }
+
+    fn = optind < argc ? argv[optind] : "-";
+
+    // We want access to in->fp.cram->fp, but this is an opaque struct so we
+    // can't get that.  However we opened with hopen and then reopen as
+    // CRAM with hts_hopen, which will swallow the initial hFILE and take
+    // ownership of it.  Hence we now know in->fp.cram->fp.
+    if (!(hf_in = hopen(fn, "r"))) {
+        print_error_errno("cram_size", "failed to open file '%s'", fn);
+        goto out;
+    }
+    if (!(in = hts_hopen(hf_in, fn, "r"))) {
+        print_error_errno("cram_size", "failed to open file '%s'", fn);
+        goto out;
+    }
+
+    if (!(h = sam_hdr_read(in)))
+        goto out;
+
+    status = cram_size(hf_in, in, h, outfp, verbose, encodings) ? 1 : 0;
+
+ out:
+    if (h)
+        sam_hdr_destroy(h);
+    if (in)
+        sam_close(in);              // also closes hf_in
+    else if (hf_in)
+        hclose_abruptly(hf_in);     // hts_hopen failed; hFILE is still ours
+    // outfp is NULL when fopen() for -o failed.
+    if (outfp && outfp != samtools_stdout && fclose(outfp) != 0) {
+        print_error_errno("cram_size", "failed to close output file");
+        status = 1;
+    }
+
+    return status;
+}
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <htslib/kstring.h>
#include "samtools.h"
-#define DEFAULT_FASTA_LINE_LEN 60
+// Negative indicates the same as input data
+#define DEFAULT_FASTA_LINE_LEN -60
+
+#ifndef ABS
+# define ABS(x) ((x)>=0?(x):-(x))
+#endif
static unsigned char comp_base[256] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name,
- const int ignore, const int length, const hts_pos_t seq_len) {
+ const int ignore, const hts_pos_t length, const hts_pos_t seq_len) {
int id;
hts_pos_t beg, end;
} else if (seq_len == 0) {
fprintf(stderr, "[faidx] Zero length sequence: %s\n", name);
} else if (fai_parse_region(faid, name, &id, &beg, &end, 0)
- && (end < INT_MAX) && (seq_len != end - beg)) {
+ && (end < HTS_POS_MAX) && (seq_len != end - beg)) {
fprintf(stderr, "[faidx] Truncated sequence: %s\n", name);
}
static int write_output(faidx_t *faid, FILE *file, const char *name, const int ignore,
- const int length, const int rev,
+ const hts_pos_t length, const int rev,
const char *pos_strand_name, const char *neg_strand_name,
enum fai_format_options format) {
- hts_pos_t seq_len;
+ hts_pos_t seq_len, wrap_len = length;
+ if (wrap_len < 0)
+ wrap_len = fai_line_length(faid, name);
+ if (wrap_len <= 0)
+ wrap_len = HTS_POS_MAX;
char *seq = fai_fetch64(faid, name, &seq_len);
if (format == FAI_FASTA) {
reverse_complement(seq, seq_len);
}
- if (write_line(faid, file, seq, name, ignore, length, seq_len)
+ if (write_line(faid, file, seq, name, ignore, wrap_len, seq_len)
== EXIT_FAILURE) {
free(seq);
return EXIT_FAILURE;
reverse(qual, seq_len);
}
- if (write_line(faid, file, qual, name, ignore, length, seq_len)
+ if (write_line(faid, file, qual, name, ignore, wrap_len, seq_len)
== EXIT_FAILURE) {
free(qual);
return EXIT_FAILURE;
static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, const int ignore,
- const int length, const int rev,
+ const hts_pos_t length, const int rev,
const char *pos_strand_name,
const char *neg_strand_name,
enum fai_format_options format) {
int faidx_core(int argc, char *argv[], enum fai_format_options format)
{
int c, ignore_error = 0, rev = 0;
- int line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */
+ hts_pos_t line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */
char* output_file = NULL; /* output file (default is stdout ) */
char *region_file = NULL; // list of regions from file, one per line
char *pos_strand_name = ""; // Extension to add to name for +ve strand
while ((c = getopt_long(argc, argv, "ho:n:cr:fi", lopts, NULL)) >= 0) {
switch (c) {
case 'o': output_file = optarg; break;
- case 'n': line_len = atoi(optarg);
- if(line_len<1) {
- fprintf(stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,DEFAULT_FASTA_LINE_LEN);
- line_len= DEFAULT_FASTA_LINE_LEN ;
- }
+ case 'n': line_len = strtol(optarg, NULL, 10);
+ if (line_len < 0) {
+ fprintf(stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,ABS(DEFAULT_FASTA_LINE_LEN));
+ line_len= ABS(DEFAULT_FASTA_LINE_LEN);
+ }
break;
case 'c': ignore_error = 1; break;
case 'r': region_file = optarg; break;
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013, 2016, 2018-2020 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016, 2018-2020, 2022 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <htslib/kstring.h>
#include "samtools.h"
-#define DEFAULT_FASTA_LINE_LEN 60
+// Negative indicates the same as input data
+#define DEFAULT_FASTA_LINE_LEN -60
+
+#ifndef ABS
+# define ABS(x) ((x)>=0?(x):-(x))
+#endif
static unsigned char comp_base[256] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
static int write_line(faidx_t *faid, FILE *file, const char *line, const char *name,
- const int ignore, const int length, const hts_pos_t seq_len) {
+ const int ignore, const hts_pos_t length, const hts_pos_t seq_len) {
int id;
hts_pos_t beg, end;
} else if (seq_len == 0) {
fprintf(samtools_stderr, "[faidx] Zero length sequence: %s\n", name);
} else if (fai_parse_region(faid, name, &id, &beg, &end, 0)
- && (end < INT_MAX) && (seq_len != end - beg)) {
+ && (end < HTS_POS_MAX) && (seq_len != end - beg)) {
fprintf(samtools_stderr, "[faidx] Truncated sequence: %s\n", name);
}
static int write_output(faidx_t *faid, FILE *file, const char *name, const int ignore,
- const int length, const int rev,
+ const hts_pos_t length, const int rev,
const char *pos_strand_name, const char *neg_strand_name,
enum fai_format_options format) {
- hts_pos_t seq_len;
+ hts_pos_t seq_len, wrap_len = length;
+ if (wrap_len < 0)
+ wrap_len = fai_line_length(faid, name);
+ if (wrap_len <= 0)
+ wrap_len = HTS_POS_MAX;
char *seq = fai_fetch64(faid, name, &seq_len);
if (format == FAI_FASTA) {
reverse_complement(seq, seq_len);
}
- if (write_line(faid, file, seq, name, ignore, length, seq_len)
+ if (write_line(faid, file, seq, name, ignore, wrap_len, seq_len)
== EXIT_FAILURE) {
free(seq);
return EXIT_FAILURE;
reverse(qual, seq_len);
}
- if (write_line(faid, file, qual, name, ignore, length, seq_len)
+ if (write_line(faid, file, qual, name, ignore, wrap_len, seq_len)
== EXIT_FAILURE) {
free(qual);
return EXIT_FAILURE;
static int read_regions_from_file(faidx_t *faid, hFILE *in_file, FILE *file, const int ignore,
- const int length, const int rev,
+ const hts_pos_t length, const int rev,
const char *pos_strand_name,
const char *neg_strand_name,
enum fai_format_options format) {
int faidx_core(int argc, char *argv[], enum fai_format_options format)
{
int c, ignore_error = 0, rev = 0;
- int line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */
+ hts_pos_t line_len = DEFAULT_FASTA_LINE_LEN ;/* fasta line len */
char* output_file = NULL; /* output file (default is samtools_stdout ) */
char *region_file = NULL; // list of regions from file, one per line
char *pos_strand_name = ""; // Extension to add to name for +ve strand
while ((c = getopt_long(argc, argv, "ho:n:cr:fi", lopts, NULL)) >= 0) {
switch (c) {
case 'o': output_file = optarg; break;
- case 'n': line_len = atoi(optarg);
- if(line_len<1) {
- fprintf(samtools_stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,DEFAULT_FASTA_LINE_LEN);
- line_len= DEFAULT_FASTA_LINE_LEN ;
- }
+ case 'n': line_len = strtol(optarg, NULL, 10);
+ if (line_len < 0) {
+ fprintf(samtools_stderr,"[faidx] bad line length '%s', using default:%d\n",optarg,ABS(DEFAULT_FASTA_LINE_LEN));
+ line_len= ABS(DEFAULT_FASTA_LINE_LEN);
+ }
break;
case 'c': ignore_error = 1; break;
case 'r': region_file = optarg; break;
--- /dev/null
+/* reset.c -- removes aligner updates and reference data from input sam /
+ bam / cram file and makes read data raw for new processing
+
+ Copyright (C) 2022, 2023 Genome Research Ltd.
+
+ Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+#include "samtools.h"
+#include "htslib/sam.h"
+#include "sam_opts.h"
+#include "htslib/thread_pool.h"
+#include "htslib/khash.h"
+#include "sam_utils.h"
+#include <unistd.h>
+
+// TAGNUM packs the first two characters of X into one integer (first char in
+// the high byte); it is used as the key into the aux-tag sets below.
+#define TAGNUM(X) (((X)[0] << 8) | (X)[1])
+// LONG_OPT offsets an option character by 128 so that a long-only option can
+// reuse the letter of an existing short option without colliding with it.
+#define LONG_OPT(X) (128 + (X))
+
+// Run-time configuration of the reset command.
+// NOTE: main_reset initialises this struct positionally, so the member order
+// must not change.
+typedef struct conf_data
+{
+ int keepRGs; // non-zero: RG header lines (and RG aux tags) are kept
+ int noPGentry; // non-zero: do not add a PG line for this reset invocation
+ auxhash_t aux_keep; // set of aux tags to retain; when set, all other tags are removed
+ auxhash_t aux_remove; // set of aux tags to remove; consulted only when aux_keep is unset
+ char *pgid; // PG ID from which onwards input PG lines are dropped
+} conf_data;
+
+/// usage - print the command-line help of "samtools reset"
+/** @param fp - stream the help text is written to
+ *
+ * Prints the command-specific options followed by the shared global options.
+returns nothing
+*/
+static void usage(FILE *fp)
+{
+ // the whole help text is one literal continued with backslash-newline, so
+ // the leading whitespace of every continuation line is part of the output
+ fprintf(fp, "Usage: samtools reset [options]\n\
+ -o FILE Output file\n\
+ -x, --remove-tag STR\n\
+ Aux tags to be removed\n\
+ --keep-tag STR\n\
+ Aux tags to be retained. Equivalent to -x ^STR\n\
+ --reject-PG ID\n\
+ Removes PG line with ID matching to input and succeeding PG lines\n\
+ --no-RG To have RG lines or not\n\
+ --no-PG To have PG entry or not for reset operation\n");
+
+ // same option selector as SAM_OPT_GLOBAL_OPTIONS in main_reset: only the
+ // 'O' and '@' slots are enabled
+ sam_global_opt_help(fp, "--O--@--");
+ return;
+}
+
+/// removeauxtags - strip unwanted aux tags from an alignment record, in place
+/** @param bamdata - record whose aux tags are filtered
+ *  @param config  - reset configuration; its tag sets may be updated here
+ *
+ * When config->keepRGs is zero the RG tag is forced out: it is deleted from the
+ * keep set and added to the remove set (which is created if no set exists yet).
+ * If a keep set exists, every tag not in it is removed and the remove set is
+ * ignored; otherwise every tag found in the remove set is removed.
+ * Does nothing when either pointer is NULL or when no filter applies.
+returns nothing
+*/
+void removeauxtags(bam1_t *bamdata, conf_data *config)
+{
+    const char rg[] = "RG";
+    const char *tag = NULL;
+    uint8_t *auxdata = NULL;
+    khint_t iter = 0;
+    int ret = 0;
+
+    if (!bamdata || !config)
+        return;
+
+    if (!config->keepRGs) {
+        //RG tags have to go; make sure the active filter says so
+        if (!config->aux_keep && !config->aux_remove) {
+            //no aux tag filter in use yet, create a remove filter
+            config->aux_remove = kh_init(aux_exists);
+        }
+        if (config->aux_keep) {
+            //keep set in use, drop RG from it if present
+            iter = kh_get(aux_exists, config->aux_keep, TAGNUM(rg));
+            if (iter != kh_end(config->aux_keep)) {
+                kh_del(aux_exists, config->aux_keep, iter);
+            }
+        }
+        if (config->aux_remove) {
+            //remove set in use, add RG if not present
+            iter = kh_get(aux_exists, config->aux_remove, TAGNUM(rg));
+            if (iter == kh_end(config->aux_remove)) {
+                kh_put(aux_exists, config->aux_remove, TAGNUM(rg), &ret);
+            }
+        }
+    }
+
+    //nothing to filter with: either no filtering was requested or the set
+    //could not be allocated above. Without this guard the loop below would
+    //never advance and spin forever.
+    if (!config->aux_keep && !config->aux_remove)
+        return;
+
+    for (auxdata = bam_aux_first(bamdata); auxdata; ) {
+        int drop;
+        tag = bam_aux_tag(auxdata);
+        if (config->aux_keep) {
+            //keep option (or remove option with ^) in use: drop what is not listed
+            drop = kh_get(aux_exists, config->aux_keep, TAGNUM(tag)) == kh_end(config->aux_keep);
+        }
+        else {
+            //remove option in use: drop what is listed
+            drop = kh_get(aux_exists, config->aux_remove, TAGNUM(tag)) != kh_end(config->aux_remove);
+        }
+        //both calls return the next tag to look at, so the cursor always moves on
+        auxdata = drop ? bam_aux_remove(bamdata, auxdata) : bam_aux_next(bamdata, auxdata);
+    }
+}
+
+/// getRGlines - copy every RG header line of the input header to the output header
+/** @param in_samhdr - header the RG lines are read from
+ *  @param out_samhdr - header the RG lines are appended to
+ *
+ * Lines are copied in input order; copying stops at the first failure.
+returns 1 on failure 0 on success
+*/
+int getRGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr)
+{
+    const char rg[] = "RG";
+    kstring_t rgline = KS_INITIALIZE;
+    int status = 0, total = 0, idx = 0;
+
+    if (!(in_samhdr && out_samhdr)) {
+        fprintf(stderr, "Invalid parameters in getRGlines!\n");
+        return 1;
+    }
+
+    total = sam_hdr_count_lines(in_samhdr, rg);
+    if (total == -1) {
+        fprintf(stderr, "Failed to get RG count!\n");
+        return 1;
+    }
+
+    //status doubles as the stop condition: the first failure ends the copy
+    for (idx = 0; idx < total && !status; ++idx) {
+        ks_clear(&rgline);
+        if (sam_hdr_find_line_pos(in_samhdr, rg, idx, &rgline) != 0) {
+            fprintf(stderr, "Failed to get RG data!\n");
+            status = 1;
+        }
+        else if (sam_hdr_add_lines(out_samhdr, rgline.s, rgline.l) != 0) {
+            fprintf(stderr, "Failed to add RG data!\n");
+            status = 1;
+        }
+    }
+
+    ks_free(&rgline);
+    return status;
+}
+
+/// getPGlines - copy PG header lines from input to output header, honouring --reject-PG / --no-PG
+/** @param in_samhdr - header the PG lines are read from
+ *  @param out_samhdr - header the PG lines are appended to
+ *  @param config - reset configuration (pgid and noPGentry are read)
+ *  @param argdump - command line text stored as CL in the new PG entry
+ *
+ * With a non-empty config->pgid, PG lines are copied in input order until the
+ * line whose ID equals pgid; that line and all following ones are dropped.
+ * Otherwise all PG lines are copied. Unless config->noPGentry is set, a PG
+ * entry describing this invocation is added afterwards.
+returns 1 on failure 0 on success
+*/
+int getPGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr, conf_data *config, const char *argdump)
+{
+    kstring_t line = KS_INITIALIZE, id = KS_INITIALIZE;
+    int i = 0, ret = 0, count = 0, filter = 0;
+    const char pg[] = "PG";
+
+    if (!in_samhdr || !out_samhdr || !config) {
+        fprintf(stderr, "Invalid parameters in getPGlines!\n");
+        return 1;
+    }
+
+    if (-1 == (count = sam_hdr_count_lines(in_samhdr, pg))) {
+        fprintf(stderr, "Failed to get PG count!\n");
+        return 1;
+    }
+
+    //reject-PG given and not empty: stop copying at the matching PG line
+    filter = config->pgid && config->pgid[0];
+
+    for (i = 0; i < count; ++i) {
+        if (filter) {
+            if (sam_hdr_find_tag_pos(in_samhdr, pg, i, "ID", &id)) {
+                fprintf(stderr, "Failed to get PG entry fields for line %d!\n", i + 1);
+                ret = 1;    //report the failure instead of silently truncating the PG chain
+                break;
+            }
+
+            if (!strcmp(id.s, config->pgid))
+                break;      //rejected PG reached, drop it and everything after it
+        }
+
+        //current PG is to be kept, get the PG line and add it to the output
+        ks_clear(&line);
+        if (sam_hdr_find_line_pos(in_samhdr, pg, i, &line)) {
+            fprintf(stderr, "Failed to get PG data at %d!\n", i + 1);
+            ret = 1;
+            break;
+        }
+
+        if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) {
+            fprintf(stderr, "Failed to add PG data!\n");
+            ret = 1;
+            break;
+        }
+    }
+
+    if (!ret && !config->noPGentry) {
+        //add PG entry with reset command
+        if (-1 == sam_hdr_add_pg(out_samhdr, "samtools", "CL", argdump, NULL)) {
+            fprintf(stderr, "Failed to set PG entry!\n");
+            ret = 1;        //keep to the documented 0 / 1 result
+        }
+    }
+    ks_free(&line);
+    ks_free(&id);
+
+    return ret;
+}
+
+/// reset - convert every primary record of the input into a raw, unmapped record
+/** @param infile - input samfile pointer
+ * @param outfile - output sam file pointer
+ * @param config - pointer to internal configuration data
+ * @param args - string containing dump of command line invocation
+ *
+ * Builds a fresh output header (HD line, optionally the input RG lines, the
+ * selected PG lines), then copies each record with flags set to unmapped,
+ * pair info and orientation reset, and sequence / quality reversed and
+ * complemented back to read order when the record was reverse-stranded.
+ * Secondary and supplementary records are skipped.
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int reset(samFile *infile, samFile *outfile, conf_data *config, char *args)
+{
+ sam_hdr_t *in_samhdr = NULL, *out_samhdr = NULL;
+ int ret = EXIT_FAILURE, ret_r = 0, ret_w = 0, i = 0;
+ bam1_t *bamdata = NULL, *outdata = NULL;
+ kstring_t querydata = KS_INITIALIZE, qualdata = KS_INITIALIZE;
+ char *sp = NULL, *qp = NULL;
+ uint8_t *bamquery = NULL, *bamqual = NULL;
+
+ if (!infile || !outfile) {
+ fprintf(stderr, "Invalid parameters in reset!\n");
+ goto error;
+ }
+
+ //read input header
+ in_samhdr = sam_hdr_read(infile);
+ if (!in_samhdr)
+ {
+ fprintf(stderr, "Failed to read header from file!\n");
+ goto error;
+ }
+ //create output header
+ if (!(out_samhdr = sam_hdr_init()))
+ {
+ fprintf(stderr, "Failed to create output header!\n");
+ goto error;
+ }
+
+ //add version to output header
+ if (-1 == sam_hdr_add_line(out_samhdr,"HD", "VN", SAM_FORMAT_VERSION, NULL)) {
+ fprintf(stderr, "Failed to set header data!\n");
+ goto error;
+ }
+ //add RG / PG lines if configured; RG lines are copied only with keepRGs,
+ //PG handling is always run as it also adds the PG entry for this command
+ if ((config->keepRGs && getRGlines(in_samhdr, out_samhdr)) ||
+ getPGlines(in_samhdr, out_samhdr, config, args)) {
+ goto error;
+ }
+
+ //write output header
+ if (sam_hdr_write(outfile, out_samhdr)) {
+ print_error_errno("reset", "Output header write failed (%d)!\n", errno);
+ goto error;
+ }
+
+ bamdata = bam_init1(); //input bam
+ outdata = bam_init1(); //output bam
+ if (!bamdata || !outdata)
+ {
+ fprintf(stderr, "Failed to allocate data memory!\n");
+ goto error;
+ }
+
+ errno = 0; i = 0;
+ sp = NULL; qp = NULL;
+ bamquery = NULL; bamqual = NULL;
+
+ //get bam data, make updates and dump to output
+ while (0 <= (ret_r = sam_read1(infile, in_samhdr, bamdata)))
+ {
+ sp = NULL; qp = NULL;
+ bamquery = NULL; bamqual = NULL;
+
+ //only primary records are carried over; secondary / supplementary ones are dropped
+ if (bamdata->core.flag & BAM_FSECONDARY || bamdata->core.flag & BAM_FSUPPLEMENTARY) {
+ continue;
+ }
+
+ //update flags
+ uint16_t flags = bamdata->core.flag & ~BAM_FPROPER_PAIR; //reset pair info
+ flags |= BAM_FUNMAP; //mark as unmapped
+ if (bamdata->core.flag & BAM_FPAIRED) {
+ flags |= BAM_FMUNMAP; //mark mate as unmapped, if it was a pair
+ }
+ flags &= ~BAM_FMREVERSE; //reset mate orientation
+
+ //scratch buffers for the text sequence and the quality values, reused across records
+ if (0 > ks_resize(&querydata, bamdata->core.l_qseq) ||
+ 0 > ks_resize(&qualdata, bamdata->core.l_qseq)) {
+ fprintf(stderr, "Failed to get allocate memory!\n");
+ ret_r = -4; //any value below -1 is reported as a read error after the loop
+ break;
+ }
+ ks_clear(&querydata);
+ ks_clear(&qualdata);
+
+ sp = ks_str(&querydata);
+ qp = ks_str(&qualdata);
+ bamquery = bam_get_seq(bamdata);
+ bamqual = bam_get_qual(bamdata);
+ if (bamdata->core.flag & BAM_FREVERSE) {
+ //sequence data ordered as reverse complemented, reorder/complement sequence and quality data as read and clear the flag
+ for (i = bamdata->core.l_qseq - 1; i >= 0; --i) {
+ //the literal is indexed by the base code from bam_seqi and yields the complementary base character
+ *sp++ = "=TGKCYSBAWRDMHVN"[bam_seqi(bamquery, i)];
+ *qp++ = bamqual[i];
+ }
+ flags &= ~BAM_FREVERSE; //reset flag as well
+ }
+ else {
+ //data in read order itself
+ for (i = 0; i < bamdata->core.l_qseq ; ++i) {
+ *sp++ = seq_nt16_str[bam_seqi(bamquery, i)];
+ }
+ memcpy(qp, bam_get_qual(bamdata), bamdata->core.l_qseq);
+ }
+
+ //filter aux tags on the input record first, so that the aux length used below is the filtered one
+ removeauxtags(bamdata, config);
+ //NOTE(review): per htslib's bam_set1 parameter order the -1 / 0 / NULL arguments presumably clear
+ //reference, position, mapq, cigar and mate fields, and the last argument reserves room for aux data - confirm
+ if (0 > (ret_w = bam_set1(outdata, bamdata->core.l_qname - bamdata->core.l_extranul - 1, bam_get_qname(bamdata), flags, -1, -1, 0, 0, NULL, -1, -1, 0, bamdata->core.l_qseq, querydata.s, qualdata.s, bam_get_l_aux(bamdata)))) {
+ print_error_errno("reset", "Failed to set output data (%d)!\n", errno);
+ break;
+ }
+
+ //append the remaining aux tags of the input record verbatim and account for them in the record length
+ memcpy(bam_get_aux(outdata), bam_get_aux(bamdata), bam_get_l_aux(bamdata));
+ outdata->l_data += bam_get_l_aux(bamdata);
+
+ errno = 0;
+ //write bam data to output
+ if (0 > (ret_w = sam_write1(outfile, out_samhdr, outdata)))
+ {
+ print_error_errno("reset", "Failed to write output data (%d)!\n", errno);
+ break;
+ }
+ // wrote the data, continue read/write cycle
+ errno = 0;
+ }
+
+ //a read result of exactly -1 is treated as normal end of input; anything lower, or a negative write result, is an error
+ if (-1 > ret_r || 0 > ret_w) {
+ //some error
+ fprintf(stderr, "Error during %s!\n", (-1 > ret_r)? "read" : "write");
+ }
+ else {
+ // no error!
+ ret = EXIT_SUCCESS;
+ }
+
+error:
+ // clean up and return result
+ if (in_samhdr)
+ sam_hdr_destroy(in_samhdr);
+ if (out_samhdr)
+ sam_hdr_destroy(out_samhdr);
+
+ if (bamdata)
+ bam_destroy1(bamdata);
+ if (outdata)
+ bam_destroy1(outdata);
+
+ if (qualdata.s)
+ ks_free(&qualdata);
+ if (querydata.s)
+ ks_free(&querydata);
+ return ret;
+}
+
+/// cleanup - release the aux tag sets held by the configuration
+/** @param config - pointer to internal configuration data
+ *
+ * Both set pointers are reset to NULL after being destroyed, so calling this
+ * again on the same configuration is harmless.
+returns nothing
+*/
+void cleanup(conf_data *config)
+{
+    //same treatment for both sets, keep set first
+    auxhash_t *sets[] = { &config->aux_keep, &config->aux_remove };
+    size_t n = 0;
+
+    for (n = 0; n < sizeof(sets) / sizeof(sets[0]); ++n) {
+        if (!*sets[n])
+            continue;
+        kh_destroy(aux_exists, *sets[n]);
+        *sets[n] = NULL;
+    }
+}
+
+/// main_reset - entry point of "samtools reset": parse options, open files, run reset
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+ *
+ * Each of --no-RG, --no-PG, --reject-PG and -o may be given only once; a
+ * repeat, an unknown option or more than one input file prints the usage and
+ * fails. Input and output default to "-" when not given.
+ * A failure while closing the output file is reported as a failure, as the
+ * output may be incomplete in that case.
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main_reset(int argc, char *argv[])
+{
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', 'O', '-', '-', '@'),   //let output format and thread count be given by user - long options
+        {"keep-tag", required_argument, NULL, LONG_OPT('x')},   //aux tags to be retained
+        {"remove-tag", required_argument, NULL, 'x'},           //aux tags to be removed; a leading ^ inverts it to a keep list
+        {"no-RG", no_argument, NULL, 1},                        //no RG lines in output, default is to keep them
+        {"reject-PG", required_argument, NULL, 'p'},            //drop input PG lines from the one with this ID onwards, default is to keep all
+        {"no-PG", no_argument, NULL, 2},                        //do not add PG entry for reset operation, default is to add it
+        {NULL, 0, NULL, 0}
+    };
+    samFile *infile = NULL, *outfile = NULL;
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    htsThreadPool tpool = {NULL, 0};
+    const char *inname = NULL, *outname = NULL;
+    int c = 0, ret = EXIT_FAILURE;
+    char outmode[4] = "w", *args = NULL;
+    conf_data resetconf = { .keepRGs = 1 };     //keep RGs and PGs by default, no tag filters
+
+    //samtools reset -o outfile -x/--remove-tag ... --keep-tag ... --threads=n --output-fmt=fmt --no-RG --reject-PG pgid --no-PG [<infile>]
+    while ((c = getopt_long(argc, argv, "o:@:x:O:", lopts, NULL)) >= 0)
+    {
+        switch (c)
+        {
+        case 1:     //--no-RG
+            if (!resetconf.keepRGs) {
+                usage(stderr);  //already given!
+                goto exit;
+            }
+            resetconf.keepRGs = 0;
+            break;
+        case 2:     //--no-PG
+            if (resetconf.noPGentry) {
+                usage(stderr);  //already given!
+                goto exit;
+            }
+            resetconf.noPGentry = 1;
+            break;
+        case 'p':   //--reject-PG=<id>
+            if (resetconf.pgid) {
+                usage(stderr);  //already given!
+                goto exit;
+            }
+            resetconf.pgid = optarg;
+            break;
+        case 'o':   //output file name
+            if (outname) {      //already given!
+                usage(stderr);
+                goto exit;
+            }
+            outname = optarg;
+            break;
+        case 'x':   //remove aux tag
+            if (*optarg == '^') {   //remove all except given ones!
+                if (parse_aux_list(&resetconf.aux_keep, optarg+1, "main_reset")) {
+                    usage(stderr);
+                    goto exit;
+                }
+            }
+            else {                  //remove given ones
+                if (parse_aux_list(&resetconf.aux_remove, optarg, "main_reset")) {
+                    usage(stderr);
+                    goto exit;
+                }
+            }
+            break;
+        case LONG_OPT('x'):     //keep aux tags
+            if (parse_aux_list(&resetconf.aux_keep, optarg, "main_reset")) {
+                usage(stderr);
+                goto exit;
+            }
+            break;
+        // handle standard samtool options like thread count, output format...
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0)
+                break;
+            // couldn't parse or unknown options, show usage!
+            /* fallthrough */
+        case '?':   //unknown options found!
+            usage(stderr);
+            goto exit;
+        }
+    }
+
+    if (argc == 1 && isatty(STDIN_FILENO)) {
+        //no args and input is stdin -- it is the usage check
+        usage(stdout);
+        ret = EXIT_SUCCESS;
+        goto exit;
+    }
+    //else have other args or input from redirection/pipe/other device -- validate and work
+
+    if (!outname)
+        outname = "-";
+
+    //check and fail if unnecessary parameters are given
+    c = argc - optind;
+    if (c > 1) {
+        usage(stderr);
+        goto exit;
+    }
+    inname = (c == 1) ? argv[optind] : "-";
+
+    //set output file format based on name
+    sam_open_mode(outmode + 1, outname, NULL);
+
+    //open input and output files
+    infile = sam_open(inname, "r");
+    outfile = sam_open_format(outname, outmode, &ga.out);
+    if (!infile || !outfile) {
+        fprintf(stderr, "Could not open %s%s%s\n", !infile ? inname : "", (!infile && !outfile)? ", " : "", !outfile ? outname : "");
+        goto exit;
+    }
+
+    // set the thread count if given as argument
+    if (ga.nthreads > 0)
+    {
+        if (!(tpool.pool = hts_tpool_init(ga.nthreads)))
+        {
+            fprintf(stderr, "\nFailed to setup thread pool\n");
+            goto exit;
+        }
+
+        //one pool shared by reader and writer
+        hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool);
+        hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool);
+    }
+
+    args = stringify_argv(argc + 1, argv - 1);  //to dump invocation in PG line
+
+    //do the reset!
+    ret = reset(infile, outfile, &resetconf, args);
+
+exit:
+    free(args);     //free(NULL) is a no-op
+    if (infile)
+        sam_close(infile);
+    if (outfile && sam_close(outfile) < 0) {
+        //closing can still fail to get pending output written; do not report success then
+        fprintf(stderr, "Error closing output file\n");
+        ret = EXIT_FAILURE;
+    }
+    //files are closed before the pool they were using is destroyed
+    if (tpool.pool)
+        hts_tpool_destroy(tpool.pool);
+    cleanup(&resetconf);
+    sam_global_args_free(&ga);
+
+    return ret;
+}
--- /dev/null
+#include "samtools.pysam.h"
+
+/* reset.c -- removes aligner updates and reference data from input sam /
+ bam / cram file and makes read data raw for new processing
+
+ Copyright (C) 2022, 2023 Genome Research Ltd.
+
+ Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+#include "samtools.h"
+#include "htslib/sam.h"
+#include "sam_opts.h"
+#include "htslib/thread_pool.h"
+#include "htslib/khash.h"
+#include "sam_utils.h"
+#include <unistd.h>
+
+// TAGNUM packs the first two characters of X into one integer (first char in
+// the high byte); it is used as the key into the aux-tag sets below.
+#define TAGNUM(X) (((X)[0] << 8) | (X)[1])
+// LONG_OPT offsets an option character by 128 so that a long-only option can
+// reuse the letter of an existing short option without colliding with it.
+#define LONG_OPT(X) (128 + (X))
+
+// Run-time configuration of the reset command.
+// NOTE: main_reset initialises this struct positionally, so the member order
+// must not change.
+typedef struct conf_data
+{
+ int keepRGs; // non-zero: RG header lines (and RG aux tags) are kept
+ int noPGentry; // non-zero: do not add a PG line for this reset invocation
+ auxhash_t aux_keep; // set of aux tags to retain; when set, all other tags are removed
+ auxhash_t aux_remove; // set of aux tags to remove; consulted only when aux_keep is unset
+ char *pgid; // PG ID from which onwards input PG lines are dropped
+} conf_data;
+
+/// usage - print the command-line help of "samtools reset"
+/** @param fp - stream the help text is written to
+ *
+ * Prints the command-specific options followed by the shared global options.
+returns nothing
+*/
+static void usage(FILE *fp)
+{
+ // the whole help text is one literal continued with backslash-newline, so
+ // the leading whitespace of every continuation line is part of the output
+ fprintf(fp, "Usage: samtools reset [options]\n\
+ -o FILE Output file\n\
+ -x, --remove-tag STR\n\
+ Aux tags to be removed\n\
+ --keep-tag STR\n\
+ Aux tags to be retained. Equivalent to -x ^STR\n\
+ --reject-PG ID\n\
+ Removes PG line with ID matching to input and succeeding PG lines\n\
+ --no-RG To have RG lines or not\n\
+ --no-PG To have PG entry or not for reset operation\n");
+
+ // same option selector as SAM_OPT_GLOBAL_OPTIONS in main_reset: only the
+ // 'O' and '@' slots are enabled
+ sam_global_opt_help(fp, "--O--@--");
+ return;
+}
+
+/// removeauxtags - strip unwanted aux tags from an alignment record, in place
+/** @param bamdata - record whose aux tags are filtered
+ *  @param config  - reset configuration; its tag sets may be updated here
+ *
+ * When config->keepRGs is zero the RG tag is forced out: it is deleted from the
+ * keep set and added to the remove set (which is created if no set exists yet).
+ * If a keep set exists, every tag not in it is removed and the remove set is
+ * ignored; otherwise every tag found in the remove set is removed.
+ * Does nothing when either pointer is NULL or when no filter applies.
+returns nothing
+*/
+void removeauxtags(bam1_t *bamdata, conf_data *config)
+{
+    const char rg[] = "RG";
+    const char *tag = NULL;
+    uint8_t *auxdata = NULL;
+    khint_t iter = 0;
+    int ret = 0;
+
+    if (!bamdata || !config)
+        return;
+
+    if (!config->keepRGs) {
+        //RG tags have to go; make sure the active filter says so
+        if (!config->aux_keep && !config->aux_remove) {
+            //no aux tag filter in use yet, create a remove filter
+            config->aux_remove = kh_init(aux_exists);
+        }
+        if (config->aux_keep) {
+            //keep set in use, drop RG from it if present
+            iter = kh_get(aux_exists, config->aux_keep, TAGNUM(rg));
+            if (iter != kh_end(config->aux_keep)) {
+                kh_del(aux_exists, config->aux_keep, iter);
+            }
+        }
+        if (config->aux_remove) {
+            //remove set in use, add RG if not present
+            iter = kh_get(aux_exists, config->aux_remove, TAGNUM(rg));
+            if (iter == kh_end(config->aux_remove)) {
+                kh_put(aux_exists, config->aux_remove, TAGNUM(rg), &ret);
+            }
+        }
+    }
+
+    //nothing to filter with: either no filtering was requested or the set
+    //could not be allocated above. Without this guard the loop below would
+    //never advance and spin forever.
+    if (!config->aux_keep && !config->aux_remove)
+        return;
+
+    for (auxdata = bam_aux_first(bamdata); auxdata; ) {
+        int drop;
+        tag = bam_aux_tag(auxdata);
+        if (config->aux_keep) {
+            //keep option (or remove option with ^) in use: drop what is not listed
+            drop = kh_get(aux_exists, config->aux_keep, TAGNUM(tag)) == kh_end(config->aux_keep);
+        }
+        else {
+            //remove option in use: drop what is listed
+            drop = kh_get(aux_exists, config->aux_remove, TAGNUM(tag)) != kh_end(config->aux_remove);
+        }
+        //both calls return the next tag to look at, so the cursor always moves on
+        auxdata = drop ? bam_aux_remove(bamdata, auxdata) : bam_aux_next(bamdata, auxdata);
+    }
+}
+
+/// getRGlines - copy every RG header line of the input header to the output header
+/** @param in_samhdr - header the RG lines are read from
+ *  @param out_samhdr - header the RG lines are appended to
+ *
+ * Lines are copied in input order; copying stops at the first failure.
+returns 1 on failure 0 on success
+*/
+int getRGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr)
+{
+    const char rg[] = "RG";
+    kstring_t rgline = KS_INITIALIZE;
+    int status = 0, total = 0, idx = 0;
+
+    if (!(in_samhdr && out_samhdr)) {
+        fprintf(samtools_stderr, "Invalid parameters in getRGlines!\n");
+        return 1;
+    }
+
+    total = sam_hdr_count_lines(in_samhdr, rg);
+    if (total == -1) {
+        fprintf(samtools_stderr, "Failed to get RG count!\n");
+        return 1;
+    }
+
+    //status doubles as the stop condition: the first failure ends the copy
+    for (idx = 0; idx < total && !status; ++idx) {
+        ks_clear(&rgline);
+        if (sam_hdr_find_line_pos(in_samhdr, rg, idx, &rgline) != 0) {
+            fprintf(samtools_stderr, "Failed to get RG data!\n");
+            status = 1;
+        }
+        else if (sam_hdr_add_lines(out_samhdr, rgline.s, rgline.l) != 0) {
+            fprintf(samtools_stderr, "Failed to add RG data!\n");
+            status = 1;
+        }
+    }
+
+    ks_free(&rgline);
+    return status;
+}
+
+/// getPGlines - copy PG header lines from input to output header, honouring --reject-PG / --no-PG
+/** @param in_samhdr - header the PG lines are read from
+ *  @param out_samhdr - header the PG lines are appended to
+ *  @param config - reset configuration (pgid and noPGentry are read)
+ *  @param argdump - command line text stored as CL in the new PG entry
+ *
+ * With a non-empty config->pgid, PG lines are copied in input order until the
+ * line whose ID equals pgid; that line and all following ones are dropped.
+ * Otherwise all PG lines are copied. Unless config->noPGentry is set, a PG
+ * entry describing this invocation is added afterwards.
+returns 1 on failure 0 on success
+*/
+int getPGlines(sam_hdr_t *in_samhdr, sam_hdr_t *out_samhdr, conf_data *config, const char *argdump)
+{
+    kstring_t line = KS_INITIALIZE, id = KS_INITIALIZE;
+    int i = 0, ret = 0, count = 0, filter = 0;
+    const char pg[] = "PG";
+
+    if (!in_samhdr || !out_samhdr || !config) {
+        fprintf(samtools_stderr, "Invalid parameters in getPGlines!\n");
+        return 1;
+    }
+
+    if (-1 == (count = sam_hdr_count_lines(in_samhdr, pg))) {
+        fprintf(samtools_stderr, "Failed to get PG count!\n");
+        return 1;
+    }
+
+    //reject-PG given and not empty: stop copying at the matching PG line
+    filter = config->pgid && config->pgid[0];
+
+    for (i = 0; i < count; ++i) {
+        if (filter) {
+            if (sam_hdr_find_tag_pos(in_samhdr, pg, i, "ID", &id)) {
+                fprintf(samtools_stderr, "Failed to get PG entry fields for line %d!\n", i + 1);
+                ret = 1;    //report the failure instead of silently truncating the PG chain
+                break;
+            }
+
+            if (!strcmp(id.s, config->pgid))
+                break;      //rejected PG reached, drop it and everything after it
+        }
+
+        //current PG is to be kept, get the PG line and add it to the output
+        ks_clear(&line);
+        if (sam_hdr_find_line_pos(in_samhdr, pg, i, &line)) {
+            fprintf(samtools_stderr, "Failed to get PG data at %d!\n", i + 1);
+            ret = 1;
+            break;
+        }
+
+        if (sam_hdr_add_lines(out_samhdr, line.s, line.l)) {
+            fprintf(samtools_stderr, "Failed to add PG data!\n");
+            ret = 1;
+            break;
+        }
+    }
+
+    if (!ret && !config->noPGentry) {
+        //add PG entry with reset command
+        if (-1 == sam_hdr_add_pg(out_samhdr, "samtools", "CL", argdump, NULL)) {
+            fprintf(samtools_stderr, "Failed to set PG entry!\n");
+            ret = 1;        //keep to the documented 0 / 1 result
+        }
+    }
+    ks_free(&line);
+    ks_free(&id);
+
+    return ret;
+}
+
+/// reset - convert every primary record of the input into a raw, unmapped record
+/** @param infile - input samfile pointer
+ * @param outfile - output sam file pointer
+ * @param config - pointer to internal configuration data
+ * @param args - string containing dump of command line invocation
+ *
+ * Builds a fresh output header (HD line, optionally the input RG lines, the
+ * selected PG lines), then copies each record with flags set to unmapped,
+ * pair info and orientation reset, and sequence / quality reversed and
+ * complemented back to read order when the record was reverse-stranded.
+ * Secondary and supplementary records are skipped.
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int reset(samFile *infile, samFile *outfile, conf_data *config, char *args)
+{
+ sam_hdr_t *in_samhdr = NULL, *out_samhdr = NULL;
+ int ret = EXIT_FAILURE, ret_r = 0, ret_w = 0, i = 0;
+ bam1_t *bamdata = NULL, *outdata = NULL;
+ kstring_t querydata = KS_INITIALIZE, qualdata = KS_INITIALIZE;
+ char *sp = NULL, *qp = NULL;
+ uint8_t *bamquery = NULL, *bamqual = NULL;
+
+ if (!infile || !outfile) {
+ fprintf(samtools_stderr, "Invalid parameters in reset!\n");
+ goto error;
+ }
+
+ //read input header
+ in_samhdr = sam_hdr_read(infile);
+ if (!in_samhdr)
+ {
+ fprintf(samtools_stderr, "Failed to read header from file!\n");
+ goto error;
+ }
+ //create output header
+ if (!(out_samhdr = sam_hdr_init()))
+ {
+ fprintf(samtools_stderr, "Failed to create output header!\n");
+ goto error;
+ }
+
+ //add version to output header
+ if (-1 == sam_hdr_add_line(out_samhdr,"HD", "VN", SAM_FORMAT_VERSION, NULL)) {
+ fprintf(samtools_stderr, "Failed to set header data!\n");
+ goto error;
+ }
+ //add RG / PG lines if configured; RG lines are copied only with keepRGs,
+ //PG handling is always run as it also adds the PG entry for this command
+ if ((config->keepRGs && getRGlines(in_samhdr, out_samhdr)) ||
+ getPGlines(in_samhdr, out_samhdr, config, args)) {
+ goto error;
+ }
+
+ //write output header
+ if (sam_hdr_write(outfile, out_samhdr)) {
+ print_error_errno("reset", "Output header write failed (%d)!\n", errno);
+ goto error;
+ }
+
+ bamdata = bam_init1(); //input bam
+ outdata = bam_init1(); //output bam
+ if (!bamdata || !outdata)
+ {
+ fprintf(samtools_stderr, "Failed to allocate data memory!\n");
+ goto error;
+ }
+
+ errno = 0; i = 0;
+ sp = NULL; qp = NULL;
+ bamquery = NULL; bamqual = NULL;
+
+ //get bam data, make updates and dump to output
+ while (0 <= (ret_r = sam_read1(infile, in_samhdr, bamdata)))
+ {
+ sp = NULL; qp = NULL;
+ bamquery = NULL; bamqual = NULL;
+
+ //only primary records are carried over; secondary / supplementary ones are dropped
+ if (bamdata->core.flag & BAM_FSECONDARY || bamdata->core.flag & BAM_FSUPPLEMENTARY) {
+ continue;
+ }
+
+ //update flags
+ uint16_t flags = bamdata->core.flag & ~BAM_FPROPER_PAIR; //reset pair info
+ flags |= BAM_FUNMAP; //mark as unmapped
+ if (bamdata->core.flag & BAM_FPAIRED) {
+ flags |= BAM_FMUNMAP; //mark mate as unmapped, if it was a pair
+ }
+ flags &= ~BAM_FMREVERSE; //reset mate orientation
+
+ //scratch buffers for the text sequence and the quality values, reused across records
+ if (0 > ks_resize(&querydata, bamdata->core.l_qseq) ||
+ 0 > ks_resize(&qualdata, bamdata->core.l_qseq)) {
+ fprintf(samtools_stderr, "Failed to get allocate memory!\n");
+ ret_r = -4; //any value below -1 is reported as a read error after the loop
+ break;
+ }
+ ks_clear(&querydata);
+ ks_clear(&qualdata);
+
+ sp = ks_str(&querydata);
+ qp = ks_str(&qualdata);
+ bamquery = bam_get_seq(bamdata);
+ bamqual = bam_get_qual(bamdata);
+ if (bamdata->core.flag & BAM_FREVERSE) {
+ //sequence data ordered as reverse complemented, reorder/complement sequence and quality data as read and clear the flag
+ for (i = bamdata->core.l_qseq - 1; i >= 0; --i) {
+ //the literal is indexed by the base code from bam_seqi and yields the complementary base character
+ *sp++ = "=TGKCYSBAWRDMHVN"[bam_seqi(bamquery, i)];
+ *qp++ = bamqual[i];
+ }
+ flags &= ~BAM_FREVERSE; //reset flag as well
+ }
+ else {
+ //data in read order itself
+ for (i = 0; i < bamdata->core.l_qseq ; ++i) {
+ *sp++ = seq_nt16_str[bam_seqi(bamquery, i)];
+ }
+ memcpy(qp, bam_get_qual(bamdata), bamdata->core.l_qseq);
+ }
+
+ //filter aux tags on the input record first, so that the aux length used below is the filtered one
+ removeauxtags(bamdata, config);
+ //NOTE(review): per htslib's bam_set1 parameter order the -1 / 0 / NULL arguments presumably clear
+ //reference, position, mapq, cigar and mate fields, and the last argument reserves room for aux data - confirm
+ if (0 > (ret_w = bam_set1(outdata, bamdata->core.l_qname - bamdata->core.l_extranul - 1, bam_get_qname(bamdata), flags, -1, -1, 0, 0, NULL, -1, -1, 0, bamdata->core.l_qseq, querydata.s, qualdata.s, bam_get_l_aux(bamdata)))) {
+ print_error_errno("reset", "Failed to set output data (%d)!\n", errno);
+ break;
+ }
+
+ //append the remaining aux tags of the input record verbatim and account for them in the record length
+ memcpy(bam_get_aux(outdata), bam_get_aux(bamdata), bam_get_l_aux(bamdata));
+ outdata->l_data += bam_get_l_aux(bamdata);
+
+ errno = 0;
+ //write bam data to output
+ if (0 > (ret_w = sam_write1(outfile, out_samhdr, outdata)))
+ {
+ print_error_errno("reset", "Failed to write output data (%d)!\n", errno);
+ break;
+ }
+ // wrote the data, continue read/write cycle
+ errno = 0;
+ }
+
+ //a read result of exactly -1 is treated as normal end of input; anything lower, or a negative write result, is an error
+ if (-1 > ret_r || 0 > ret_w) {
+ //some error
+ fprintf(samtools_stderr, "Error during %s!\n", (-1 > ret_r)? "read" : "write");
+ }
+ else {
+ // no error!
+ ret = EXIT_SUCCESS;
+ }
+
+error:
+ // clean up and return result
+ if (in_samhdr)
+ sam_hdr_destroy(in_samhdr);
+ if (out_samhdr)
+ sam_hdr_destroy(out_samhdr);
+
+ if (bamdata)
+ bam_destroy1(bamdata);
+ if (outdata)
+ bam_destroy1(outdata);
+
+ if (qualdata.s)
+ ks_free(&qualdata);
+ if (querydata.s)
+ ks_free(&querydata);
+ return ret;
+}
+
+/// cleanup - release the aux tag sets held by the configuration
+/** @param config - pointer to internal configuration data
+ *
+ * Both set pointers are reset to NULL after being destroyed, so calling this
+ * again on the same configuration is harmless.
+returns nothing
+*/
+void cleanup(conf_data *config)
+{
+    //same treatment for both sets, keep set first
+    auxhash_t *sets[] = { &config->aux_keep, &config->aux_remove };
+    size_t n = 0;
+
+    for (n = 0; n < sizeof(sets) / sizeof(sets[0]); ++n) {
+        if (!*sets[n])
+            continue;
+        kh_destroy(aux_exists, *sets[n]);
+        *sets[n] = NULL;
+    }
+}
+
+/// main_reset - entry point of "samtools reset": parse options, open files, run reset
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+ *
+ * Each of --no-RG, --no-PG, --reject-PG and -o may be given only once; a
+ * repeat, an unknown option or more than one input file prints the usage and
+ * fails. Input and output default to "-" when not given.
+ * A failure while closing the output file is reported as a failure, as the
+ * output may be incomplete in that case.
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main_reset(int argc, char *argv[])
+{
+    static const struct option lopts[] = {
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', 'O', '-', '-', '@'),   //let output format and thread count be given by user - long options
+        {"keep-tag", required_argument, NULL, LONG_OPT('x')},   //aux tags to be retained
+        {"remove-tag", required_argument, NULL, 'x'},           //aux tags to be removed; a leading ^ inverts it to a keep list
+        {"no-RG", no_argument, NULL, 1},                        //no RG lines in output, default is to keep them
+        {"reject-PG", required_argument, NULL, 'p'},            //drop input PG lines from the one with this ID onwards, default is to keep all
+        {"no-PG", no_argument, NULL, 2},                        //do not add PG entry for reset operation, default is to add it
+        {NULL, 0, NULL, 0}
+    };
+    samFile *infile = NULL, *outfile = NULL;
+    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+    htsThreadPool tpool = {NULL, 0};
+    const char *inname = NULL, *outname = NULL;
+    int c = 0, ret = EXIT_FAILURE;
+    char outmode[4] = "w", *args = NULL;
+    conf_data resetconf = { .keepRGs = 1 };     //keep RGs and PGs by default, no tag filters
+
+    //samtools reset -o outfile -x/--remove-tag ... --keep-tag ... --threads=n --output-fmt=fmt --no-RG --reject-PG pgid --no-PG [<infile>]
+    while ((c = getopt_long(argc, argv, "o:@:x:O:", lopts, NULL)) >= 0)
+    {
+        switch (c)
+        {
+        case 1:     //--no-RG
+            if (!resetconf.keepRGs) {
+                usage(samtools_stderr);  //already given!
+                goto exit;
+            }
+            resetconf.keepRGs = 0;
+            break;
+        case 2:     //--no-PG
+            if (resetconf.noPGentry) {
+                usage(samtools_stderr);  //already given!
+                goto exit;
+            }
+            resetconf.noPGentry = 1;
+            break;
+        case 'p':   //--reject-PG=<id>
+            if (resetconf.pgid) {
+                usage(samtools_stderr);  //already given!
+                goto exit;
+            }
+            resetconf.pgid = optarg;
+            break;
+        case 'o':   //output file name
+            if (outname) {      //already given!
+                usage(samtools_stderr);
+                goto exit;
+            }
+            outname = optarg;
+            break;
+        case 'x':   //remove aux tag
+            if (*optarg == '^') {   //remove all except given ones!
+                if (parse_aux_list(&resetconf.aux_keep, optarg+1, "main_reset")) {
+                    usage(samtools_stderr);
+                    goto exit;
+                }
+            }
+            else {                  //remove given ones
+                if (parse_aux_list(&resetconf.aux_remove, optarg, "main_reset")) {
+                    usage(samtools_stderr);
+                    goto exit;
+                }
+            }
+            break;
+        case LONG_OPT('x'):     //keep aux tags
+            if (parse_aux_list(&resetconf.aux_keep, optarg, "main_reset")) {
+                usage(samtools_stderr);
+                goto exit;
+            }
+            break;
+        // handle standard samtool options like thread count, output format...
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0)
+                break;
+            // couldn't parse or unknown options, show usage!
+            /* fallthrough */
+        case '?':   //unknown options found!
+            usage(samtools_stderr);
+            goto exit;
+        }
+    }
+
+    if (argc == 1 && isatty(STDIN_FILENO)) {
+        //no args and input is stdin -- it is the usage check
+        usage(samtools_stdout);
+        ret = EXIT_SUCCESS;
+        goto exit;
+    }
+    //else have other args or input from redirection/pipe/other device -- validate and work
+
+    if (!outname)
+        outname = "-";
+
+    //check and fail if unnecessary parameters are given
+    c = argc - optind;
+    if (c > 1) {
+        usage(samtools_stderr);
+        goto exit;
+    }
+    inname = (c == 1) ? argv[optind] : "-";
+
+    //set output file format based on name
+    sam_open_mode(outmode + 1, outname, NULL);
+
+    //open input and output files
+    infile = sam_open(inname, "r");
+    outfile = sam_open_format(outname, outmode, &ga.out);
+    if (!infile || !outfile) {
+        fprintf(samtools_stderr, "Could not open %s%s%s\n", !infile ? inname : "", (!infile && !outfile)? ", " : "", !outfile ? outname : "");
+        goto exit;
+    }
+
+    // set the thread count if given as argument
+    if (ga.nthreads > 0)
+    {
+        if (!(tpool.pool = hts_tpool_init(ga.nthreads)))
+        {
+            fprintf(samtools_stderr, "\nFailed to setup thread pool\n");
+            goto exit;
+        }
+
+        //one pool shared by reader and writer
+        hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool);
+        hts_set_opt(outfile, HTS_OPT_THREAD_POOL, &tpool);
+    }
+
+    args = stringify_argv(argc + 1, argv - 1);  //to dump invocation in PG line
+
+    //do the reset!
+    ret = reset(infile, outfile, &resetconf, args);
+
+exit:
+    free(args);     //free(NULL) is a no-op
+    if (infile)
+        sam_close(infile);
+    if (outfile && sam_close(outfile) < 0) {
+        //closing can still fail to get pending output written; do not report success then
+        fprintf(samtools_stderr, "Error closing output file\n");
+        ret = EXIT_FAILURE;
+    }
+    //files are closed before the pool they were using is destroyed
+    if (tpool.pool)
+        hts_tpool_destroy(tpool.pool);
+    cleanup(&resetconf);
+    sam_global_args_free(&ga);
+
+    return ret;
+}
/* sam_utils.c -- various utilities internal to samtools.
- Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2014-2016, 2018, 2019, 2023 Genome Research Ltd.
Author: John Marshall <jm18@sanger.ac.uk>
#include <string.h>
#include <errno.h>
-#include "samtools.h"
+#include "sam_utils.h"
static htsFile *samtools_stdout = NULL;
return fn_idx;
}
+
+
+/// parse_aux_list - parses given string for aux tags which are ',' separated
+/** @param h         - pointer to a SET holding aux tags; the set is created
+ *                     here when *h is NULL, otherwise tags are added to it
+ *  @param optarg    - string having the ',' separated aux tags, each of
+ *                     which must be exactly two characters long
+ *  @param msgheader - string to be used during error output as a header;
+ *                     NULL is printed as an empty header
+ *
+ * returns -1 on failure and 0 on success.  On a parse or insertion failure
+ * the set is destroyed and *h is set to NULL.
+ * moved from sam_view.c to here for common usage at different source files
+ */
+int parse_aux_list(auxhash_t *h, char *optarg, const char *msgheader)
+{
+    if (!*h) {
+        *h = kh_init(aux_exists);
+        if (!*h)    // allocation failed; kh_put() below would use a NULL table
+            return -1;
+    }
+
+    // Consume one two-character tag per iteration
+    while (optarg[0] != '\0' && optarg[1] != '\0') {
+        int x = optarg[0]<<8 | optarg[1];   // both tag characters packed into one key
+        int ret = 0;
+        kh_put(aux_exists, *h, x, &ret);
+        if (ret < 0) {
+            kh_destroy(aux_exists, *h);
+            *h = NULL;
+            return -1;
+        }
+
+        optarg += 2;
+        if (*optarg == ',') // allow white-space too for easy `cat file`?
+            optarg++;
+        else if (*optarg != 0)
+            break;
+    }
+
+    // Anything left over is a tag that is not exactly two characters long
+    if (*optarg != '\0') {
+        fprintf(stderr, "%s: Error parsing option, "
+                "auxiliary tags should be exactly two characters long.\n", msgheader ? msgheader : "");
+        kh_destroy(aux_exists, *h);
+        *h = NULL;
+        return -1;
+    }
+
+    return 0;
+}
/* sam_utils.c -- various utilities internal to samtools.
- Copyright (C) 2014-2016, 2018, 2019 Genome Research Ltd.
+ Copyright (C) 2014-2016, 2018, 2019, 2023 Genome Research Ltd.
Author: John Marshall <jm18@sanger.ac.uk>
#include <string.h>
#include <errno.h>
-#include "samtools.h"
+#include "sam_utils.h"
static htsFile *samtools_stdout_internal = NULL;
return fn_idx;
}
+
+
+/// parse_aux_list - parses given string for aux tags which are ',' separated
+/** @param h         - pointer to a SET holding aux tags; the set is created
+ *                     here when *h is NULL, otherwise tags are added to it
+ *  @param optarg    - string having the ',' separated aux tags, each of
+ *                     which must be exactly two characters long
+ *  @param msgheader - string to be used during error output as a header;
+ *                     NULL is printed as an empty header
+ *
+ * returns -1 on failure and 0 on success.  On a parse or insertion failure
+ * the set is destroyed and *h is set to NULL.
+ * moved from sam_view.c to here for common usage at different source files
+ */
+int parse_aux_list(auxhash_t *h, char *optarg, const char *msgheader)
+{
+    if (!*h) {
+        *h = kh_init(aux_exists);
+        if (!*h)    // allocation failed; kh_put() below would use a NULL table
+            return -1;
+    }
+
+    // Consume one two-character tag per iteration
+    while (optarg[0] != '\0' && optarg[1] != '\0') {
+        int x = optarg[0]<<8 | optarg[1];   // both tag characters packed into one key
+        int ret = 0;
+        kh_put(aux_exists, *h, x, &ret);
+        if (ret < 0) {
+            kh_destroy(aux_exists, *h);
+            *h = NULL;
+            return -1;
+        }
+
+        optarg += 2;
+        if (*optarg == ',') // allow white-space too for easy `cat file`?
+            optarg++;
+        else if (*optarg != 0)
+            break;
+    }
+
+    // Anything left over is a tag that is not exactly two characters long
+    if (*optarg != '\0') {
+        fprintf(samtools_stderr, "%s: Error parsing option, "
+                "auxiliary tags should be exactly two characters long.\n", msgheader ? msgheader : "");
+        kh_destroy(aux_exists, *h);
+        *h = NULL;
+        return -1;
+    }
+
+    return 0;
+}
--- /dev/null
+/* sam_utils.h -- to hold utility functions and types
+
+ Copyright (C) 2023 Genome Research Ltd.
+
+ Author: Vasudeva Sarma <vasudeva.sarma@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef SAM_UTIL_H
+#define SAM_UTIL_H
+
+#include "htslib/khash.h"
+#include "htslib/sam.h"
+
+// This header holds utility functions and data types that are shared across
+// several source files.
+
+/* parse_aux_list and aux_exists below were moved here from sam_view.c so that
+ * other source files can use them too.
+ */
+
+KHASH_SET_INIT_INT(aux_exists) //SET data type to hold aux tags
+typedef khash_t(aux_exists) *auxhash_t;
+
+/// parse_aux_list - parses given string for aux tags which are ',' separated
+/** @param h         - pointer to a SET holding aux tags; the set is created
+ *                     when *h is NULL
+ *  @param optarg    - string having the ',' separated aux tags, each of
+ *                     which must be exactly two characters long
+ *  @param msgheader - string to be used during error output as a header;
+ *                     NULL is printed as an empty header
+ *
+ * returns -1 on failure and 0 on success.  On a parse or insertion failure
+ * the set is destroyed and *h is set to NULL.
+ * moved from sam_view.c to here for common usage at different source files
+ */
+int parse_aux_list(auxhash_t *h, char *optarg, const char *msgheader);
+
+
+// The utility function declarations below were moved here from samtools.h,
+// which now includes this header.
+
+// NOTE(review): presumably tags a function as printf-like, with fmt/args being
+// the positions of the format string and first variadic argument -- confirm
+// against HTS_FORMAT in htslib/hts_defs.h.
+#define CHECK_PRINTF(fmt,args) HTS_FORMAT(HTS_PRINTF_FMT, (fmt), (args))
+
+void print_error(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3);
+void print_error_errno(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3);
+
+void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp);
+
+/* Utility functions to register an output htsFile/samFile/vcfFile that
+ * might be stdout. If FNAME is "-" or NULL, records FP so that print_error()
+ * et al can automatically flush it before printing an error message.
+ */
+void autoflush_if_stdout(htsFile *fp, const char *fname);
+
+/* Call this before closing FP; check_sam_close() does this automatically.
+ */
+void release_autoflush(htsFile *fp);
+
+/*
+ * Utility function to add an index to a file we've opened for write.
+ * NB: Call this after writing the header and before writing sequences.
+ *
+ * The returned index filename should be freed by the caller, but only
+ * after sam_idx_save has been called.
+ *
+ * Returns index filename on success,
+ * NULL on failure.
+ */
+char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header);
+
+#endif //SAM_UTIL_H
+
+
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2022 Genome Research Ltd.
+ Copyright (C) 2009-2023 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "sam_opts.h"
#include "bam.h" // for bam_get_library and bam_remove_B
#include "bedidx.h"
+#include "sam_utils.h"
KHASH_SET_INIT_STR(str)
typedef khash_t(str) *strhash_t;
-KHASH_SET_INIT_INT(aux_exists)
-typedef khash_t(aux_exists) *auxhash_t;
-
// This structure contains the settings for a samview run
typedef struct samview_settings {
strhash_t rghash;
char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx;
int fetch_pairs, nreglist;
hts_reglist_t *reglist;
+ int sanitize;
+ int count_rf; // CRAM_OPT_REQUIRED_FIELDS for view -c
} samview_settings_t;
// Copied from htslib/sam.c.
b->core.flag &= ~settings->remove_flag;
}
-int parse_aux_list(auxhash_t *h, char *optarg) {
- if (!*h)
- *h = kh_init(aux_exists);
-
- while (strlen(optarg) >= 2) {
- int x = optarg[0]<<8 | optarg[1];
- int ret = 0;
- kh_put(aux_exists, *h, x, &ret);
- if (ret < 0)
- return -1;
-
- optarg += 2;
- if (*optarg == ',') // allow white-space too for easy `cat file`?
- optarg++;
- else if (*optarg != 0)
- break;
- }
-
- if (strlen(optarg) != 0) {
- fprintf(stderr, "main_samview: Error parsing option, "
- "auxiliary tags should be exactly two characters long.\n");
- return -1;
- }
-
- return 0;
-}
-
static int cmp_reglist_intervals(const void *aptr, const void *bptr)
{
hts_pair_pos_t *a = (hts_pair_pos_t*)aptr;
// Common code for processing and writing a record
static inline int process_one_record(samview_settings_t *conf, bam1_t *b,
int *write_error) {
+ if (conf->sanitize)
+ if (bam_sanitize(conf->header, b, conf->sanitize) < 0)
+ return -1;
+
if (!process_aln(conf->header, b, conf)) {
if (!conf->is_count) {
change_flag(b, conf);
return (l >= 4 && strcasecmp(fn + l-4, ".sam") == 0);
}
+// Destroy whichever of the keep/remove aux tag sets have been created by
+// parse_aux_list() during option parsing.
+static void aux_list_free(samview_settings_t *settings)
+{
+    auxhash_t keep = settings->keep_tag;
+    auxhash_t drop = settings->remove_tag;
+
+    if (keep != NULL) {
+        kh_destroy(aux_exists, keep);
+    }
+    if (drop != NULL) {
+        kh_destroy(aux_exists, drop);
+    }
+}
+
int main_samview(int argc, char *argv[])
{
samview_settings_t settings;
memset(&settings,0,sizeof(settings));
settings.subsam_frac = -1.0;
+ settings.count_rf = SAM_FLAG; // don't want 0, and this is quick
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
{"unoutput", required_argument, NULL, 'U'},
{"use-index", no_argument, NULL, 'M'},
{"with-header", no_argument, NULL, 'h'},
+ {"sanitize", required_argument, NULL, 'z'},
};
/* parse command-line options */
char *tmp;
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pP",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pPz:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
print_error("view", "Incorrect sampling argument \"%s\"", optarg);
goto view_end;
}
+ settings.count_rf |= SAM_QNAME;
break;
case LONGOPT('s'):
settings.subsam_frac = strtod(optarg, &tmp);
print_error("view", "Incorrect sampling argument \"%s\"", optarg);
goto view_end;
}
+ settings.count_rf |= SAM_QNAME;
break;
case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break;
- case 'm': settings.min_qlen = atoi(optarg); break;
+ case 'm':
+ settings.min_qlen = atoi(optarg);
+ settings.count_rf |= SAM_SEQ;
+ break;
case 'c': settings.is_count = 1; break;
case 'S': break;
case 'b': out_format = "b"; break;
case 'o': settings.fn_out = strdup(optarg); break;
case 'U': settings.fn_un_out = strdup(optarg); break;
case 'X': has_index_file = 1; break;
- case 'f': settings.flag_on |= bam_str2flag(optarg); break;
- case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+ case 'f':
+ settings.flag_on |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
+ case 'F':
+ settings.flag_off |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
case LONGOPT('g'):
- settings.flag_anyon |= bam_str2flag(optarg); break;
- case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
- case 'q': settings.min_mapQ = atoi(optarg); break;
+ settings.flag_anyon |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
+ case 'G':
+ settings.flag_alloff |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
+ case 'q':
+ settings.min_mapQ = atoi(optarg);
+ settings.count_rf |= SAM_MAPQ;
+ break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
- case 'l': settings.library = strdup(optarg); break;
+ case 'l':
+ settings.library = strdup(optarg);
+ settings.count_rf |= SAM_RGAUX;
+ break;
case 'p': settings.unmap = 1; break;
case 'P': settings.fetch_pairs = 1; settings.multi_region = 1; break;
+ case 'z':
+ if ((settings.sanitize = bam_sanitize_options(optarg)) < 0) {
+ ret = 1;
+ goto view_end;
+ }
+ break;
case LONGOPT('L'):
settings.multi_region = 1;
// fall through
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR;
break;
case 'r':
if (add_read_group_single("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_RGAUX;
break;
case 'R':
if (add_read_groups_file("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_RGAUX;
break;
case 'N':
if (add_read_names_file("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_QNAME;
break;
+
case 'd':
if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) {
- print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
+ print_error("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
+ // Some tag filtering affects other fields
+ if (memcmp(settings.tag, "NM", 2) == 0 ||
+ memcmp(settings.tag, "MD", 2) == 0)
+ settings.count_rf |= SAM_AUX | SAM_SEQ;
+ else if (memcmp(settings.tag, "RG", 2) == 0)
+ settings.count_rf |= SAM_RGAUX;
+ else
+ settings.count_rf |= SAM_AUX;
break;
+
case 'D':
// Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
// path translation as described at:
// http://www.mingw.org/wiki/Posix_path_conversion
if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) {
- print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
+ print_error("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
+ // Some tag filtering affects other fields
+ if (memcmp(settings.tag, "NM", 2) == 0 ||
+ memcmp(settings.tag, "MD", 2) == 0)
+ settings.count_rf |= SAM_AUX | SAM_SEQ;
+ else if (memcmp(settings.tag, "RG", 2) == 0)
+ settings.count_rf |= SAM_RGAUX;
+ else
+ settings.count_rf |= SAM_AUX;
break;
+
case LONGOPT('?'):
return usage(stdout, EXIT_SUCCESS, 1);
case '?':
print_error("main_samview", "Couldn't initialise filter");
return 1;
}
+ settings.count_rf = INT_MAX; // no way to know what we need
break;
case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
case 'x':
if (*optarg == '^') {
- if (parse_aux_list(&settings.keep_tag, optarg+1))
+ if (parse_aux_list(&settings.keep_tag, optarg+1, "main_samview")) {
+ aux_list_free(&settings);
return usage(stderr, EXIT_FAILURE, 0);
+ }
} else {
- if (parse_aux_list(&settings.remove_tag, optarg))
+ if (parse_aux_list(&settings.remove_tag, optarg, "main_samview")) {
+ aux_list_free(&settings);
return usage(stderr, EXIT_FAILURE, 0);
+ }
}
break;
case LONGOPT('x'):
- if (parse_aux_list(&settings.keep_tag, optarg))
+ if (parse_aux_list(&settings.keep_tag, optarg, "main_samview")) {
+ aux_list_free(&settings);
return usage(stderr, EXIT_FAILURE, 0);
+ }
break;
default:
settings.unmap = 0; // Not valid in counting mode
}
- if (ga.nthreads > 1) {
+ if (ga.nthreads > 0) {
if (!(p.pool = hts_tpool_init(ga.nthreads))) {
fprintf(stderr, "Error creating thread pool\n");
ret = 1;
// Initialize BAM/CRAM index
char **regs = NULL;
int nregs = 0;
- if ( has_index_file && optind < argc - 2 ) regs = &argv[optind+2], nregs = argc - optind - 2, settings.fn_idx_in = argv[optind+1];
- else if ( !has_index_file && optind < argc - 1 ) regs = &argv[optind+1], nregs = argc - optind - 1;
- else if ( has_index_file )
- {
+ if ( has_index_file && optind <= argc - 2 ) {
+ regs = optind < argc-2 ? &argv[optind+2] : NULL;
+ nregs = argc - optind - 2;
+ settings.fn_idx_in = argv[optind+1];
+ } else if (!has_index_file && optind < argc - 1 ) {
+ regs = &argv[optind+1];
+ nregs = argc - optind - 1;
+ } else if ( has_index_file && argc-optind < 2) {
print_error("view", "Incorrect number of arguments for -X option. Aborting.");
return 1;
}
+ if (regs)
+ settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR;
+
if ( settings.fn_idx_in || nregs || settings.multi_region )
{
settings.hts_idx = settings.fn_idx_in ? sam_index_load2(settings.in, settings.fn_in, settings.fn_idx_in) : sam_index_load(settings.in, settings.fn_in);
}
}
+ if (settings.is_count)
+ // Won't fail, but also wouldn't matter if it did
+ hts_set_opt(settings.in, CRAM_OPT_REQUIRED_FIELDS, settings.count_rf);
+
if ( settings.fetch_pairs )
{
hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs);
ret = iter ? multi_region_view(&settings, iter) : 1;
if (ret) goto view_end;
}
- else if ( !settings.hts_idx ) // stream through the entire file
- {
+ else if ( !settings.hts_idx || optind+1 >= argc-has_index_file ) {
+ // stream through the entire file
ret = stream_view(&settings);
if (ret) goto view_end;
} else { // retrieve alignments in specified regions
free(settings.fn_un_out_idx);
free(arg_list);
- if (settings.keep_tag)
- kh_destroy(aux_exists, settings.keep_tag);
- if (settings.remove_tag)
- kh_destroy(aux_exists, settings.remove_tag);
+ aux_list_free(&settings);
return ret;
}
" Comma-separated read tags to preserve (repeatable) [null].\n"
" Equivalent to \"-x ^STR\"\n"
" -B, --remove-B Collapse the backward CIGAR operation\n"
+" -z, --sanitize FLAGS Perform sanitity checking and fixing on records.\n"
+" FLAGS is comma separated (see manual). [off]\n"
"\n"
"General options:\n"
" -?, --help Print long help, including note about region specification\n"
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2022 Genome Research Ltd.
+ Copyright (C) 2009-2023 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "sam_opts.h"
#include "bam.h" // for bam_get_library and bam_remove_B
#include "bedidx.h"
+#include "sam_utils.h"
KHASH_SET_INIT_STR(str)
typedef khash_t(str) *strhash_t;
-KHASH_SET_INIT_INT(aux_exists)
-typedef khash_t(aux_exists) *auxhash_t;
-
// This structure contains the settings for a samview run
typedef struct samview_settings {
strhash_t rghash;
char *fn_in, *fn_idx_in, *fn_out, *fn_fai, *fn_un_out, *fn_out_idx, *fn_un_out_idx;
int fetch_pairs, nreglist;
hts_reglist_t *reglist;
+ int sanitize;
+ int count_rf; // CRAM_OPT_REQUIRED_FIELDS for view -c
} samview_settings_t;
// Copied from htslib/sam.c.
b->core.flag &= ~settings->remove_flag;
}
-int parse_aux_list(auxhash_t *h, char *optarg) {
- if (!*h)
- *h = kh_init(aux_exists);
-
- while (strlen(optarg) >= 2) {
- int x = optarg[0]<<8 | optarg[1];
- int ret = 0;
- kh_put(aux_exists, *h, x, &ret);
- if (ret < 0)
- return -1;
-
- optarg += 2;
- if (*optarg == ',') // allow white-space too for easy `cat file`?
- optarg++;
- else if (*optarg != 0)
- break;
- }
-
- if (strlen(optarg) != 0) {
- fprintf(samtools_stderr, "main_samview: Error parsing option, "
- "auxiliary tags should be exactly two characters long.\n");
- return -1;
- }
-
- return 0;
-}
-
static int cmp_reglist_intervals(const void *aptr, const void *bptr)
{
hts_pair_pos_t *a = (hts_pair_pos_t*)aptr;
// Common code for processing and writing a record
static inline int process_one_record(samview_settings_t *conf, bam1_t *b,
int *write_error) {
+ if (conf->sanitize)
+ if (bam_sanitize(conf->header, b, conf->sanitize) < 0)
+ return -1;
+
if (!process_aln(conf->header, b, conf)) {
if (!conf->is_count) {
change_flag(b, conf);
return (l >= 4 && strcasecmp(fn + l-4, ".sam") == 0);
}
+// Destroy whichever of the keep/remove aux tag sets have been created by
+// parse_aux_list() during option parsing.
+static void aux_list_free(samview_settings_t *settings)
+{
+    auxhash_t keep = settings->keep_tag;
+    auxhash_t drop = settings->remove_tag;
+
+    if (keep != NULL) {
+        kh_destroy(aux_exists, keep);
+    }
+    if (drop != NULL) {
+        kh_destroy(aux_exists, drop);
+    }
+}
+
int main_samview(int argc, char *argv[])
{
samview_settings_t settings;
memset(&settings,0,sizeof(settings));
settings.subsam_frac = -1.0;
+ settings.count_rf = SAM_FLAG; // don't want 0, and this is quick
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
{"unoutput", required_argument, NULL, 'U'},
{"use-index", no_argument, NULL, 'M'},
{"with-header", no_argument, NULL, 'h'},
+ {"sanitize", required_argument, NULL, 'z'},
};
/* parse command-line options */
char *tmp;
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pP",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:T:R:N:d:D:L:s:@:m:x:U:MXe:pPz:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
print_error("view", "Incorrect sampling argument \"%s\"", optarg);
goto view_end;
}
+ settings.count_rf |= SAM_QNAME;
break;
case LONGOPT('s'):
settings.subsam_frac = strtod(optarg, &tmp);
print_error("view", "Incorrect sampling argument \"%s\"", optarg);
goto view_end;
}
+ settings.count_rf |= SAM_QNAME;
break;
case LONGOPT('S'): settings.subsam_seed = atoi(optarg); break;
- case 'm': settings.min_qlen = atoi(optarg); break;
+ case 'm':
+ settings.min_qlen = atoi(optarg);
+ settings.count_rf |= SAM_SEQ;
+ break;
case 'c': settings.is_count = 1; break;
case 'S': break;
case 'b': out_format = "b"; break;
case 'o': settings.fn_out = strdup(optarg); break;
case 'U': settings.fn_un_out = strdup(optarg); break;
case 'X': has_index_file = 1; break;
- case 'f': settings.flag_on |= bam_str2flag(optarg); break;
- case 'F': settings.flag_off |= bam_str2flag(optarg); break;
+ case 'f':
+ settings.flag_on |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
+ case 'F':
+ settings.flag_off |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
case LONGOPT('g'):
- settings.flag_anyon |= bam_str2flag(optarg); break;
- case 'G': settings.flag_alloff |= bam_str2flag(optarg); break;
- case 'q': settings.min_mapQ = atoi(optarg); break;
+ settings.flag_anyon |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
+ case 'G':
+ settings.flag_alloff |= bam_str2flag(optarg);
+ settings.count_rf |= SAM_FLAG | SAM_RNEXT;
+ break;
+ case 'q':
+ settings.min_mapQ = atoi(optarg);
+ settings.count_rf |= SAM_MAPQ;
+ break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
- case 'l': settings.library = strdup(optarg); break;
+ case 'l':
+ settings.library = strdup(optarg);
+ settings.count_rf |= SAM_RGAUX;
+ break;
case 'p': settings.unmap = 1; break;
case 'P': settings.fetch_pairs = 1; settings.multi_region = 1; break;
+ case 'z':
+ if ((settings.sanitize = bam_sanitize_options(optarg)) < 0) {
+ ret = 1;
+ goto view_end;
+ }
+ break;
case LONGOPT('L'):
settings.multi_region = 1;
// fall through
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR;
break;
case 'r':
if (add_read_group_single("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_RGAUX;
break;
case 'R':
if (add_read_groups_file("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_RGAUX;
break;
case 'N':
if (add_read_names_file("view", &settings, optarg) != 0) {
ret = 1;
goto view_end;
}
+ settings.count_rf |= SAM_QNAME;
break;
+
case 'd':
if (strlen(optarg) < 2 || (strlen(optarg) > 2 && optarg[2] != ':')) {
- print_error_errno("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
+ print_error("view", "Invalid \"tag:value\" option: \"%s\"", optarg);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
+ // Some tag filtering affects other fields
+ if (memcmp(settings.tag, "NM", 2) == 0 ||
+ memcmp(settings.tag, "MD", 2) == 0)
+ settings.count_rf |= SAM_AUX | SAM_SEQ;
+ else if (memcmp(settings.tag, "RG", 2) == 0)
+ settings.count_rf |= SAM_RGAUX;
+ else
+ settings.count_rf |= SAM_AUX;
break;
+
case 'D':
// Allow ";" as delimiter besides ":" to support MinGW CLI POSIX
// path translation as described at:
// http://www.mingw.org/wiki/Posix_path_conversion
if (strlen(optarg) < 4 || (optarg[2] != ':' && optarg[2] != ';')) {
- print_error_errno("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
+ print_error("view", "Invalid \"tag:file\" option: \"%s\"", optarg);
ret = 1;
goto view_end;
}
ret = 1;
goto view_end;
}
+ // Some tag filtering affects other fields
+ if (memcmp(settings.tag, "NM", 2) == 0 ||
+ memcmp(settings.tag, "MD", 2) == 0)
+ settings.count_rf |= SAM_AUX | SAM_SEQ;
+ else if (memcmp(settings.tag, "RG", 2) == 0)
+ settings.count_rf |= SAM_RGAUX;
+ else
+ settings.count_rf |= SAM_AUX;
break;
+
case LONGOPT('?'):
return usage(samtools_stdout, EXIT_SUCCESS, 1);
case '?':
print_error("main_samview", "Couldn't initialise filter");
return 1;
}
+ settings.count_rf = INT_MAX; // no way to know what we need
break;
case LONGOPT('r'): settings.remove_flag |= bam_str2flag(optarg); break;
case LONGOPT('a'): settings.add_flag |= bam_str2flag(optarg); break;
case 'x':
if (*optarg == '^') {
- if (parse_aux_list(&settings.keep_tag, optarg+1))
+ if (parse_aux_list(&settings.keep_tag, optarg+1, "main_samview")) {
+ aux_list_free(&settings);
return usage(samtools_stderr, EXIT_FAILURE, 0);
+ }
} else {
- if (parse_aux_list(&settings.remove_tag, optarg))
+ if (parse_aux_list(&settings.remove_tag, optarg, "main_samview")) {
+ aux_list_free(&settings);
return usage(samtools_stderr, EXIT_FAILURE, 0);
+ }
}
break;
case LONGOPT('x'):
- if (parse_aux_list(&settings.keep_tag, optarg))
+ if (parse_aux_list(&settings.keep_tag, optarg, "main_samview")) {
+ aux_list_free(&settings);
return usage(samtools_stderr, EXIT_FAILURE, 0);
+ }
break;
default:
settings.unmap = 0; // Not valid in counting mode
}
- if (ga.nthreads > 1) {
+ if (ga.nthreads > 0) {
if (!(p.pool = hts_tpool_init(ga.nthreads))) {
fprintf(samtools_stderr, "Error creating thread pool\n");
ret = 1;
// Initialize BAM/CRAM index
char **regs = NULL;
int nregs = 0;
- if ( has_index_file && optind < argc - 2 ) regs = &argv[optind+2], nregs = argc - optind - 2, settings.fn_idx_in = argv[optind+1];
- else if ( !has_index_file && optind < argc - 1 ) regs = &argv[optind+1], nregs = argc - optind - 1;
- else if ( has_index_file )
- {
+ if ( has_index_file && optind <= argc - 2 ) {
+ regs = optind < argc-2 ? &argv[optind+2] : NULL;
+ nregs = argc - optind - 2;
+ settings.fn_idx_in = argv[optind+1];
+ } else if (!has_index_file && optind < argc - 1 ) {
+ regs = &argv[optind+1];
+ nregs = argc - optind - 1;
+ } else if ( has_index_file && argc-optind < 2) {
print_error("view", "Incorrect number of arguments for -X option. Aborting.");
return 1;
}
+ if (regs)
+ settings.count_rf |= SAM_POS | SAM_RNAME | SAM_CIGAR;
+
if ( settings.fn_idx_in || nregs || settings.multi_region )
{
settings.hts_idx = settings.fn_idx_in ? sam_index_load2(settings.in, settings.fn_in, settings.fn_idx_in) : sam_index_load(settings.in, settings.fn_in);
}
}
+ if (settings.is_count)
+ // Won't fail, but also wouldn't matter if it did
+ hts_set_opt(settings.in, CRAM_OPT_REQUIRED_FIELDS, settings.count_rf);
+
if ( settings.fetch_pairs )
{
hts_itr_multi_t *iter = multi_region_init(&settings, regs, nregs);
ret = iter ? multi_region_view(&settings, iter) : 1;
if (ret) goto view_end;
}
- else if ( !settings.hts_idx ) // stream through the entire file
- {
+ else if ( !settings.hts_idx || optind+1 >= argc-has_index_file ) {
+ // stream through the entire file
ret = stream_view(&settings);
if (ret) goto view_end;
} else { // retrieve alignments in specified regions
free(settings.fn_un_out_idx);
free(arg_list);
- if (settings.keep_tag)
- kh_destroy(aux_exists, settings.keep_tag);
- if (settings.remove_tag)
- kh_destroy(aux_exists, settings.remove_tag);
+ aux_list_free(&settings);
return ret;
}
" Comma-separated read tags to preserve (repeatable) [null].\n"
" Equivalent to \"-x ^STR\"\n"
" -B, --remove-B Collapse the backward CIGAR operation\n"
+" -z, --sanitize FLAGS Perform sanitity checking and fixing on records.\n"
+" FLAGS is comma separated (see manual). [off]\n"
"\n"
"General options:\n"
" -?, --help Print long help, including note about region specification\n"
/* samtools.h -- utility routines.
- Copyright (C) 2013-2015, 2019 Genome Research Ltd.
+ Copyright (C) 2013-2015, 2019, 2023 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include "htslib/hts_defs.h"
#include "htslib/sam.h"
+#include "sam_utils.h"
const char *samtools_version(void);
-#define CHECK_PRINTF(fmt,args) HTS_FORMAT(HTS_PRINTF_FMT, (fmt), (args))
-
-void print_error(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3);
-void print_error_errno(const char *subcommand, const char *format, ...) CHECK_PRINTF(2, 3);
-
-void check_sam_close(const char *subcmd, samFile *fp, const char *fname, const char *null_fname, int *retp);
-
-/* Utility functions to register an output htsFile/samFile/vcfFile that
- * might be stdout. If FNAME is "-" or NULL, records FP so that print_error()
- * et al can automatically flush it before printing an error message.
- */
-void autoflush_if_stdout(htsFile *fp, const char *fname);
-
-/* Call this before closing FP; check_sam_close() does this automatically.
- */
-void release_autoflush(htsFile *fp);
-
-/*
- * Utility function to add an index to a file we've opened for write.
- * NB: Call this after writing the header and before writing sequences.
- *
- * The returned index filename should be freed by the caller, but only
- * after sam_idx_save has been called.
- *
- * Returns index filename on success,
- * NULL on failure.
- */
-char *auto_index(htsFile *fp, const char *fn, bam_hdr_t *header);
+/* BAM sanitizer options */
+#define FIX_POS 2
+#define FIX_MQUAL 4
+#define FIX_UNMAP 8
+#define FIX_CIGAR 16
+#define FIX_AUX 32
+
+// default for position sorted data
+#define FIX_ON (FIX_MQUAL|FIX_UNMAP|FIX_CIGAR|FIX_AUX)
+#define FIX_ALL 255
+
+// Parses a comma-separated list of "pos", "mqual", "unmap", "cigar", and "aux"
+// keywords for the bam sanitizer.
+int bam_sanitize_options(const char *str);
+
+// Sanitize a BAM record, using FIX_* bit flags as defined above.
+// Returns 0 on success,
+// <0 on failure.
+int bam_sanitize(sam_hdr_t *h, bam1_t *b, int flags);
#endif
/* stats.c -- This is the former bamcheck integrated into samtools/htslib.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Author: Sam Nicholls <sam@samnicholls.net>
uint64_t *insertions, *deletions;
uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
isize_t *isize;
+ uint64_t* mapping_qualities;
// The extremes encountered
int max_len; // Maximum read length
}
}
-void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos)
+void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos, hts_pos_t end)
{
int i;
hts_pos_t fai_ref_len;
- char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len);
+ char *fai_ref;
+
+ if (end < pos+stats->mrseq_buf-1)
+ end = pos+stats->mrseq_buf-1;
+ else if (stats->mrseq_buf < end - pos) {
+ size_t sz = end - pos;
+ uint8_t *new_rseq = realloc(stats->rseq_buf, sz);
+ if (!new_rseq)
+ error("Couldn't expand the reference sequence buffer\n");
+ stats->rseq_buf = new_rseq;
+ stats->mrseq_buf = sz;
+ }
+
+ fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len);
if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid));
uint8_t *ptr = stats->rseq_buf;
stats->max_len_1st = read_len;
if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len )
stats->max_len_2nd = read_len;
+ if ( ( bam_line->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FSUPPLEMENTARY|BAM_FQCFAIL|BAM_FDUP) ) == 0 )
+ stats->mapping_qualities[bam_line->core.qual]++;
int i;
int gc_count = 0;
// 20kbp, so the effect is negligible.
if ( stats->info->fai )
{
- int inc_ref = 0, inc_gcd = 0;
- // First pass or new chromosome
- if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
- // Read goes beyond the end of the rseq buffer
- else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
+ hts_pos_t inc_ref = 0;
+ int inc_gcd = 0;
+ // First pass or new chromosome, or read goes beyond the rseq buffer
+ if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid
+ || stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen) {
+ inc_ref=bam_line->core.pos+readlen;
+ inc_gcd=1;
+ }
// Read overlaps the next gcd bin
else if ( stats->gcd_pos+stats->info->gcd_bin_size < bam_line->core.pos+readlen )
{
inc_gcd = 1;
- if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = 1;
+ if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = bam_line->core.pos+stats->info->gcd_bin_size;
}
if ( inc_gcd )
{
if ( stats->igcd >= stats->ngcd )
realloc_gcd_buffer(stats, readlen);
if ( inc_ref )
- read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+ read_ref_seq(stats, bam_line->core.tid,
+ bam_line->core.pos, inc_ref);
stats->gcd_pos = bam_line->core.pos;
stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->info->gcd_bin_size);
}
void output_stats(FILE *to, stats_t *stats, int sparse)
{
// Calculate average insert size and standard deviation (from the main bulk data only)
- int isize, ibulk=0, icov;
+ int isize, ibulk=0, icov, imapq=0;
uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0, cov_sum=0;
double bulk=0, avg_isize=0, sd_isize=0;
for (isize=0; isize<stats->isize->nitems(stats->isize->data); isize++)
fprintf(to, "LRL\t%d\t%ld\n", ilen+1, (long)stats->read_lengths_2nd[ilen+1]);
}
+ fprintf(to, "# Mapping qualities for reads !(UNMAP|SECOND|SUPPL|QCFAIL|DUP). Use `grep ^MAPQ | cut -f 2-` to extract this part. The columns are: mapq, count\n");
+ for (imapq=0; imapq < 256; imapq++)
+ {
+ if ( stats->mapping_qualities[imapq]>0 )
+ fprintf(to, "MAPQ\t%d\t%ld\n", imapq, (long)stats->mapping_qualities[imapq]);
+ }
+
fprintf(to, "# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
for (ilen=0; ilen<stats->nindels; ilen++)
destroy_regions(stats);
if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash);
free(stats->split_name);
+ free(stats->mapping_qualities);
free(stats);
}
if (!stats->del_cycles_1st) goto nomem;
stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
if (!stats->del_cycles_2nd) goto nomem;
+ stats->mapping_qualities = calloc(256,sizeof(uint64_t));
+ if(!stats->mapping_qualities) goto nomem;
if (init_barcode_tags(stats) < 0)
goto nomem;
realloc_rseq_buffer(stats);
/* stats.c -- This is the former bamcheck integrated into samtools/htslib.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Author: Sam Nicholls <sam@samnicholls.net>
uint64_t *insertions, *deletions;
uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd;
isize_t *isize;
+ uint64_t* mapping_qualities;
// The extremes encountered
int max_len; // Maximum read length
}
}
-void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos)
+void read_ref_seq(stats_t *stats, int32_t tid, hts_pos_t pos, hts_pos_t end)
{
int i;
hts_pos_t fai_ref_len;
- char *fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len);
+ char *fai_ref;
+
+ if (end < pos+stats->mrseq_buf-1)
+ end = pos+stats->mrseq_buf-1;
+ else if (stats->mrseq_buf < end - pos) {
+ size_t sz = end - pos;
+ uint8_t *new_rseq = realloc(stats->rseq_buf, sz);
+ if (!new_rseq)
+ error("Couldn't expand the reference sequence buffer\n");
+ stats->rseq_buf = new_rseq;
+ stats->mrseq_buf = sz;
+ }
+
+ fai_ref = faidx_fetch_seq64(stats->info->fai, sam_hdr_tid2name(stats->info->sam_header, tid), pos, pos+stats->mrseq_buf-1, &fai_ref_len);
if ( fai_ref_len < 0 ) error("Failed to fetch the sequence \"%s\"\n", sam_hdr_tid2name(stats->info->sam_header, tid));
uint8_t *ptr = stats->rseq_buf;
stats->max_len_1st = read_len;
if ( order == READ_ORDER_LAST && stats->max_len_2nd < read_len )
stats->max_len_2nd = read_len;
+ if ( ( bam_line->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FSUPPLEMENTARY|BAM_FQCFAIL|BAM_FDUP) ) == 0 )
+ stats->mapping_qualities[bam_line->core.qual]++;
int i;
int gc_count = 0;
// 20kbp, so the effect is negligible.
if ( stats->info->fai )
{
- int inc_ref = 0, inc_gcd = 0;
- // First pass or new chromosome
- if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
- // Read goes beyond the end of the rseq buffer
- else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
+ hts_pos_t inc_ref = 0;
+ int inc_gcd = 0;
+ // First pass or new chromosome, or read goes beyond the rseq buffer
+ if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid
+ || stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen) {
+ inc_ref=bam_line->core.pos+readlen;
+ inc_gcd=1;
+ }
// Read overlaps the next gcd bin
else if ( stats->gcd_pos+stats->info->gcd_bin_size < bam_line->core.pos+readlen )
{
inc_gcd = 1;
- if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = 1;
+ if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->info->gcd_bin_size ) inc_ref = bam_line->core.pos+stats->info->gcd_bin_size;
}
if ( inc_gcd )
{
if ( stats->igcd >= stats->ngcd )
realloc_gcd_buffer(stats, readlen);
if ( inc_ref )
- read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+ read_ref_seq(stats, bam_line->core.tid,
+ bam_line->core.pos, inc_ref);
stats->gcd_pos = bam_line->core.pos;
stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->info->gcd_bin_size);
}
void output_stats(FILE *to, stats_t *stats, int sparse)
{
// Calculate average insert size and standard deviation (from the main bulk data only)
- int isize, ibulk=0, icov;
+ int isize, ibulk=0, icov, imapq=0;
uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0, cov_sum=0;
double bulk=0, avg_isize=0, sd_isize=0;
for (isize=0; isize<stats->isize->nitems(stats->isize->data); isize++)
fprintf(to, "LRL\t%d\t%ld\n", ilen+1, (long)stats->read_lengths_2nd[ilen+1]);
}
+ fprintf(to, "# Mapping qualities for reads !(UNMAP|SECOND|SUPPL|QCFAIL|DUP). Use `grep ^MAPQ | cut -f 2-` to extract this part. The columns are: mapq, count\n");
+ for (imapq=0; imapq < 256; imapq++)
+ {
+ if ( stats->mapping_qualities[imapq]>0 )
+ fprintf(to, "MAPQ\t%d\t%ld\n", imapq, (long)stats->mapping_qualities[imapq]);
+ }
+
fprintf(to, "# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n");
for (ilen=0; ilen<stats->nindels; ilen++)
destroy_regions(stats);
if ( stats->rg_hash ) kh_destroy(rg, stats->rg_hash);
free(stats->split_name);
+ free(stats->mapping_qualities);
free(stats);
}
if (!stats->del_cycles_1st) goto nomem;
stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
if (!stats->del_cycles_2nd) goto nomem;
+ stats->mapping_qualities = calloc(256,sizeof(uint64_t));
+ if(!stats->mapping_qualities) goto nomem;
if (init_barcode_tags(stats) < 0)
goto nomem;
realloc_rseq_buffer(stats);
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.16.1
+VERSION=1.17
# If we have a git clone, then check against the current tag
if [ -e .git ]
tmp_vars = []
for var in ['CC', 'CFLAGS', 'LDFLAGS']:
if var in os.environ:
+ if var == 'CFLAGS' and 'CCSHARED' in sysconfig.get_config_vars():
+ os.environ[var] += ' ' + sysconfig.get_config_var('CCSHARED')
print("# pysam: (env) {}={}".format(var, os.environ[var]))
elif var in sysconfig.get_config_vars():
value = sysconfig.get_config_var(var)
define_macros = []
-suffix = sysconfig.get_config_var('EXT_SUFFIX')
-if not suffix:
- suffix = sysconfig.get_config_var('SO')
+suffix = sysconfig.get_config_var('EXT_SUFFIX') or sysconfig.get_config_var('SO')
internal_htslib_libraries = [
os.path.splitext("chtslib{}".format(suffix))[0]]
a.template_length = 167
a.query_qualities = pysam.qualitystring_to_array("1234") * 200
- return a
+ self.assertTrue(a)
def testUpdateTlen(self):
"""check if updating tlen works"""
"""Benchmarking module for AlignmentFile functionality"""
import os
import pysam
+import sys
import unittest
from TestUtils import make_data_files, BAM_DATADIR, IS_PYTHON3, force_str, flatten_nested_list
import PileupTestUtils
def tearDown(self):
self.samfile.close()
+ @unittest.skipIf(sys.version_info >= (3, 11), "exercises invalid accesses, which crashes with Python 3.11")
def testIteratorOutOfScope(self):
'''test if exception is raised if pileup col is accessed after
iterator is exhausted.'''
def test_total(self):
all_read_counts = self.samfile.count()
splice_sites = self.samfile.find_introns(self.samfile.fetch())
- # there is a single unspliced read in there
- self.assertEqual(sum(splice_sites.values()), all_read_counts - 1)
+ # there is a single unspliced read and a single unmapped read in there
+ self.assertEqual(sum(splice_sites.values()), all_read_counts - 2)
def test_first(self):
reads = list(self.samfile.fetch())[:10]
# clean up previous compilation
import os
-import unittest
+import pytest
import pysam
from TestUtils import make_data_files, BAM_DATADIR, TABIX_DATADIR
except OSError:
pass
-import pyximport
-pyximport.install(build_in_temp=False)
-import _compile_test
+NO_PYXIMPORT = False
+try:
+ import pyximport
+ pyximport.install(build_in_temp=False)
+ import _compile_test
+except:
+ NO_PYXIMPORT = True
-class BAMTest(unittest.TestCase):
+@pytest.mark.skipif(NO_PYXIMPORT, reason="no pyximport")
+def test_bam():
input_filename = os.path.join(BAM_DATADIR, "ex1.bam")
-
- def testCount(self):
-
- nread = _compile_test.testCountBAM(
- pysam.Samfile(self.input_filename))
- self.assertEqual(nread, 3270)
+ nread = _compile_test.testCountBAM(
+ pysam.Samfile(input_filename))
+ assert nread == 3270
-class GTFTest(unittest.TestCase):
+@pytest.mark.skipif(NO_PYXIMPORT, reason="no pyximport")
+def test_gtf():
input_filename = os.path.join(TABIX_DATADIR, "example.gtf.gz")
- def testCount(self):
- nread = _compile_test.testCountGTF(
- pysam.Tabixfile(self.input_filename))
- self.assertEqual(nread, 237)
-
-
-if __name__ == "__main__":
- unittest.main()
+ nread = _compile_test.testCountGTF(
+ pysam.Tabixfile(input_filename))
+ assert nread == 237
HWI-C00113:131:HMHYWADXX:2:1204:13994:2816 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC ?@@DDDDDHHHHFHEFAABA@?FGBEFHIIIHH>DB@DHIHIDD>@@GHID NH:i:7 HI:i:6 AS:i:49 nM:i:0
HWI-C00113:131:HMHYWADXX:2:1212:15591:47491 272 1 17031 0 25M177N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCTGTTCCAGCTGCTCTCTCTTGC @@C+ADDDDHFFDEGEGIIIDFHIFHIIIIIGEHIIBH>FGGGHGHFGGII NH:i:7 HI:i:6 AS:i:49 nM:i:0
HWI-C00113:131:HMHYWADXX:2:2215:10125:81395 272 1 17031 0 25M859N26M * 0 0 GCACATAGAAGTAGTTCTCTGGGACCTGCAGGGCCCGCTCGTCCAGGGGGC CCCFFFFFGHHHHJJJJJJJJJHJJJJJJIJIIJJJHIJJJJJJJJJIJHE NH:i:6 HI:i:1 AS:i:49 nM:i:0
+HWI-C00113:131:HMHYWADXX:PLACED:UNMAPPED 4 1 17032 0 * * 0 0 ATGC HIJK
HWI-C00113:131:HMHYWADXX:1:2102:9065:90529 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG C@CFFFFFHHHHHJJJJJJJJJJJJJJJJJJJJJJFHIFHIJIJJJJJJJJ NH:i:5 HI:i:2 AS:i:47 nM:i:0
HWI-C00113:131:HMHYWADXX:1:2204:7767:77376 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@@FDFDDBFHADEHEIGIGIJIGHIHG?EDGHGGCFH:B?BD@FGFHGIH NH:i:5 HI:i:2 AS:i:47 nM:i:0
HWI-C00113:131:HMHYWADXX:2:1212:6793:42000 16 1 17033 0 2S23M550N26M * 0 0 GTACATAGAAGTAGTTCTCTGGGACAGGTTCTCGGTGGTGTTGAAGAGCAG @@?DADBD8CFADGFHIIIIE3A<EC:EHGGGIIB8?80?DDH>9?<FGCD NH:i:5 HI:i:2 AS:i:47 nM:i:0
infile = self.open()
i = self.iterator(infile, self.parser())
- x = i.next()
+ x = next(i)
infile.close()
# Not implemented
# self.assertRaises(ValueError, i.next)
iterator = self.iterator(
infile,
parser=self.parser())
- self.assertRaises(ValueError, iterator.next)
+ self.assertRaises(ValueError, iterator.__next__)
def testGTFTooFewFields(self):
iterator = self.iterator(
infile,
parser=self.parser())
- self.assertRaises(ValueError, iterator.next)
+ self.assertRaises(ValueError, iterator.__next__)
class TestBed(unittest.TestCase):
# two iterators working on the same file
with pysam.TabixFile(self.filename) as tabix:
- a = tabix.fetch(parser=pysam.asGTF()).next()
- b = tabix.fetch(parser=pysam.asGTF()).next()
+ a = next(tabix.fetch(parser=pysam.asGTF()))
+ b = next(tabix.fetch(parser=pysam.asGTF()))
# the first two lines differ only by the feature field
self.assertEqual(a.feature, "UTR")
self.assertEqual(b.feature, "exon")
def testDisjointIterators(self):
# two iterators working on the same file
with pysam.TabixFile(self.filename) as tabix:
- a = tabix.fetch(parser=pysam.asGTF(),
- multiple_iterators=True).next()
- b = tabix.fetch(parser=pysam.asGTF(),
- multiple_iterators=True).next()
+ a = next(tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True))
+ b = next(tabix.fetch(parser=pysam.asGTF(), multiple_iterators=True))
# both iterators are at top of file
self.assertEqual(str(a), str(b))
# technically it does not really test if the scope is correct
i = _TestMultipleIteratorsHelper(self.filename,
multiple_iterators=True)
- self.assertTrue(i.next())
+ self.assertTrue(next(i))
i = _TestMultipleIteratorsHelper(self.filename,
multiple_iterators=False)
- self.assertRaises(IOError, i.next)
+ self.assertRaises(IOError, i.__next__)
def testDoubleFetch(self):
os.unlink(tmpfilename)
def testCopy(self):
- a = self.tabix.fetch(parser=pysam.asTuple()).next()
+ a = next(self.tabix.fetch(parser=pysam.asTuple()))
b = copy.copy(a)
self.assertEqual(a, b)
- a = self.tabix.fetch(parser=pysam.asGTF()).next()
+ a = next(self.tabix.fetch(parser=pysam.asGTF()))
b = copy.copy(a)
self.assertEqual(a, b)
def test_setting_fields(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.contig = r.contig + "_test_contig"
r.source = r.source + "_test_source"
def test_setAttribute_makes_changes(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.setAttribute("transcript_id", "abcd")
sr = str(r)
self.assertEqual(r.transcript_id, "abcd")
self.assertTrue("transcript_id \"abcd\"" in sr)
def test_added_attribute_is_output(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.new_int_attribute = 12
self.assertTrue("new_int_attribute 12" in str(r).split("\t")[8])
def test_setting_start_is_one_based(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.start = 1800
self.assertEqual(r.start, 1800)
self.assertEqual(str(r).split("\t")[3], "1801")
def test_setting_end_is_one_based(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.end = 2100
self.assertEqual(r.end, 2100)
self.assertEqual(str(r).split("\t")[4], "2100")
def test_setting_frame_to_none_produces_dot(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.frame = None
self.assertEqual(str(r).split("\t")[7], ".")
r.frame = 2
self.assertEqual(str(r).split("\t")[7], "2")
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.frame = "."
self.assertEqual(r.frame, None)
self.assertEqual(str(r).split("\t")[7], ".")
def test_setting_source_to_none_produces_dot(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.source = None
self.assertEqual(str(r).split("\t")[1], ".")
r.source = "source"
self.assertEqual(str(r).split("\t")[1], "source")
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.source = "."
self.assertEqual(r.source, None)
self.assertEqual(str(r).split("\t")[1], ".")
def test_setting_feature_to_none_produces_dot(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.feature = None
self.assertEqual(str(r).split("\t")[2], ".")
r.feature = "feature"
self.assertEqual(str(r).split("\t")[2], "feature")
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.feature = "."
self.assertEqual(r.feature, None)
self.assertEqual(str(r).split("\t")[2], ".")
def test_setting_strand_to_none_produces_dot(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.strand = None
self.assertEqual(str(r).split("\t")[6], ".")
r.strand = "-"
self.assertEqual(str(r).split("\t")[6], "-")
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.strand = "."
self.assertEqual(r.strand, None)
self.assertEqual(str(r).split("\t")[6], ".")
def test_setting_score_to_none_produces_dot(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.score = None
self.assertEqual(str(r).split("\t")[5], ".")
r.score = -12.0
self.assertEqual(str(r).split("\t")[5], "-12.0")
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.score = "."
self.assertEqual(r.score, None)
self.assertEqual(str(r).split("\t")[5], ".")
self.assertEqual(str(r).split("\t")[5], "-12")
def test_asdict_contains_attributes(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
d = r.to_dict()
c = self.compare[0]
s = self.build_attribute_string(d)
self.assertEqual(s, c[8])
def test_asdict_can_be_modified(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
d = r.to_dict()
d["gene_id"] = "new_gene_id"
self.assertTrue("gene_id \"new_gene_id\"", str(r))
def test_setAttribute_makes_changes(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.setAttribute("transcript_id", "abcd")
sr = str(r)
self.assertEqual(r.transcript_id, "abcd")
self.assertTrue("transcript_id=abcd" in sr)
def test_added_attribute_is_output(self):
- r = self.tabix.fetch(parser=self.parser()).next()
+ r = next(self.tabix.fetch(parser=self.parser()))
r.new_int_attribute = 12
self.assertTrue("new_int_attribute=12" in str(r).split("\t")[8])
--- /dev/null
+# Content of tox.ini; place this file in the same directory as setup.py.
+[tox]
+envlist = py36 py311
+
+[testenv]
+deps = pytest # install pytest in the virtualenv where commands will be executed
+commands =
+ pytest tests