Release notes
=============
+Release 0.20.0
+==============
+
+This release wraps htslib/bcftools version 1.16 and samtools version 1.16.1.
+
+* [#1113] Full compatibility with setuptools v62.1.0's build directory name changes
+* [#1121] Build-time symbol check portability improved
+* [#1122] Fix setting sample genotype using .alleles property
+* [#1128] Fix test suite failure when using a libdeflate-enabled samtools
+
+Many additional type hints have been provided by the community,
+thanks!
+
Release 0.19.1
==============
compilation options. Especially for OS X this will potentially save a
lot of trouble.
-The current version of pysam wraps 3rd-party code from htslib-1.15.1, samtools-1.15.1, and bcftools-1.15.1.
+The current version of pysam wraps 3rd-party code from htslib-1.16, samtools-1.16.1, and bcftools-1.16.
Pysam is available through `pypi
<https://pypi.python.org/pypi/pysam>`_. To install, type::
/* bam2bcf.c -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
free(bca->bases); free(bca->inscns); free(bca);
}
+static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref) // Returns a bin index in [0,B2B_N_NM-1] derived from the record's NM tag, or -1 if the tag is absent; qpos is not referenced in the body
+{
+ uint8_t *nm_tag = bam_aux_get(rec, "NM");
+ if ( !nm_tag ) return -1; // record carries no NM aux tag, the caller must skip it
+ int64_t nm = bam_aux2i(nm_tag); // start from the NM value as stored in the record
+
+ // Count indels as single events, not as the number of inserted/deleted
+ // bases (which is what NM does). Add soft clips as mismatches.
+ int i;
+ for (i=0; i < rec->core.n_cigar; i++)
+ {
+ int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; // operation type of the i-th CIGAR element
+ if ( val==BAM_CSOFT_CLIP )
+ {
+ nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; // add the full length of the soft clip
+ }
+ else if ( val==BAM_CINS || val==BAM_CDEL )
+ {
+ val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; // val is reused here to hold the indel length
+ if ( val > 1 ) nm -= val - 1; // leave exactly one count for an indel longer than 1
+ }
+ }
+
+ // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV
+ // http://www.genome.org/cgi/doi/10.1101/gr.239756.118
+ nm -= is_ref ? 1 : 2; // subtract 1 when is_ref is non-zero, 2 otherwise
+
+ if ( nm < 0 ) nm = 0; // clamp from below so the result is never confused with the -1 "no tag" return
+ if ( nm >= B2B_N_NM ) nm = B2B_N_NM - 1; // clamp from above into the last bin
+
+ return nm;
+}
+
// position in the sequence with respect to the aligned part of the read
static int get_position(const bam_pileup1_t *p, int *len,
int *sc_len, int *sc_dist) {
if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ if ( bca->fmt_flag&B2B_FMT_NMBZ )
+ {
+ memset(call->ref_nm,0,sizeof(*call->ref_nm)*(call->n+1)*B2B_N_NM);
+ memset(call->alt_nm,0,sizeof(*call->alt_nm)*(call->n+1)*B2B_N_NM);
+ }
+ else
+ {
+ memset(call->ref_nm,0,sizeof(*call->ref_nm)*B2B_N_NM);
+ memset(call->alt_nm,0,sizeof(*call->alt_nm)*B2B_N_NM);
+ }
memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
memset(bca->ref_scl, 0, 100*sizeof(int));
memset(bca->alt_scl, 0, 100*sizeof(int));
if (sc_len > 99) sc_len = 99;
}
}
-
int imq = mapQ * nqual_over_60;
int ibq = baseQ * nqual_over_60;
+ int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1);
if ( bam_is_rev(p->b) )
bca->rev_mqs[imq]++;
else
bca->fwd_mqs[imq]++;
- if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
+ if ( !is_diff )
{
bca->ref_pos[epos]++;
bca->ref_bq[ibq]++;
bca->ref_mq[imq]++;
bca->ref_scl[sc_len]++;
+ if ( inm>=0 )
+ {
+ bca->ref_nm[inm]++;
+ if ( r->ref_nm ) r->ref_nm[inm]++;
+ }
}
else
{
bca->alt_bq[ibq]++;
bca->alt_mq[imq]++;
bca->alt_scl[sc_len]++;
+ if ( inm>=0 )
+ {
+ bca->alt_nm[inm]++;
+ if ( r->alt_nm ) r->alt_nm[inm]++;
+ }
}
}
call->n_alleles = j;
if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
}
+ int has_alt = (call->n_alleles==2 && call->unseen!=-1) ? 0 : 1;
/*
* Set the phred likelihood array (call->PL). This array is 15 entries long
* for each sample because that is size of an upper or lower triangle of a
for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
}
+ // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly
+ if ( !has_alt ) return 0;
+
calc_SegBias(calls, call);
// calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
// U z-normalised as +/- number of standard deviations from mean.
- if (call->ori_ref < 0) {
+ if (call->ori_ref < 0) { // indel
if (bca->fmt_flag & B2B_INFO_RPB)
call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
bca->npos, 0, 1);
call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
100, 0,1);
}
+ call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1);
+ if ( bca->fmt_flag & B2B_FMT_NMBZ )
+ {
+ for (i=0; i<n; i++)
+ {
+ float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
+ call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
+ }
+ }
} else {
// Old method; U as probability between 0 and 1
if ( bca->fmt_flag & B2B_INFO_RPB )
int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
{
extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
- int i, j, nals = 1;
+ int i, j, nals = 1, has_alt = 0;
bcf_hdr_t *hdr = bc->bcf_hdr;
rec->rid = bc->tid;
for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
}
nals++;
+ has_alt = 1;
}
}
else // SNP
if (bc->a[i] < 0) break;
kputc(',', &bc->tmp);
if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
- else kputc("ACGT"[bc->a[i]], &bc->tmp);
+ else
+ {
+ kputc("ACGT"[bc->a[i]], &bc->tmp);
+ has_alt = 1;
+ }
nals++;
}
}
bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
- if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
- if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
-
- if (bca->fmt_flag & B2B_INFO_ZSCORE) {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
- if ( bc->mwu_sc != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
- } else {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
- }
+ if ( has_alt )
+ {
+ if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
+ if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
+
+ if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+ if ( bc->mwu_nm[0] != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
+ if ( bc->mwu_sc != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+ } else {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+ }
- if ( bc->strand_bias != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
+ if ( bc->strand_bias != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
#if CDF_MWU_TESTS
- if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
- if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
- if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
- if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
+ if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
+ if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
+ if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
+ if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
#endif
+ }
+
tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
if ( fmt_flag&B2B_FMT_QS )
bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
+ if ( has_alt )
+ {
+ if ( fmt_flag&B2B_FMT_NMBZ )
+ bcf_update_format_float(hdr, rec, "NMBZ", bc->mwu_nm+1, rec->n_sample);
+ }
+
return 0;
}
/* bam2bcf.c -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
free(bca->bases); free(bca->inscns); free(bca);
}
+static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref) // Returns a bin index in [0,B2B_N_NM-1] derived from the record's NM tag, or -1 if the tag is absent; qpos is not referenced in the body
+{
+ uint8_t *nm_tag = bam_aux_get(rec, "NM");
+ if ( !nm_tag ) return -1; // record carries no NM aux tag, the caller must skip it
+ int64_t nm = bam_aux2i(nm_tag); // start from the NM value as stored in the record
+
+ // Count indels as single events, not as the number of inserted/deleted
+ // bases (which is what NM does). Add soft clips as mismatches.
+ int i;
+ for (i=0; i < rec->core.n_cigar; i++)
+ {
+ int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK; // operation type of the i-th CIGAR element
+ if ( val==BAM_CSOFT_CLIP )
+ {
+ nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; // add the full length of the soft clip
+ }
+ else if ( val==BAM_CINS || val==BAM_CDEL )
+ {
+ val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT; // val is reused here to hold the indel length
+ if ( val > 1 ) nm -= val - 1; // leave exactly one count for an indel longer than 1
+ }
+ }
+
+ // Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV
+ // http://www.genome.org/cgi/doi/10.1101/gr.239756.118
+ nm -= is_ref ? 1 : 2; // subtract 1 when is_ref is non-zero, 2 otherwise
+
+ if ( nm < 0 ) nm = 0; // clamp from below so the result is never confused with the -1 "no tag" return
+ if ( nm >= B2B_N_NM ) nm = B2B_N_NM - 1; // clamp from above into the last bin
+
+ return nm;
+}
+
// position in the sequence with respect to the aligned part of the read
static int get_position(const bam_pileup1_t *p, int *len,
int *sc_len, int *sc_dist) {
if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
+ if ( bca->fmt_flag&B2B_FMT_NMBZ )
+ {
+ memset(call->ref_nm,0,sizeof(*call->ref_nm)*(call->n+1)*B2B_N_NM);
+ memset(call->alt_nm,0,sizeof(*call->alt_nm)*(call->n+1)*B2B_N_NM);
+ }
+ else
+ {
+ memset(call->ref_nm,0,sizeof(*call->ref_nm)*B2B_N_NM);
+ memset(call->alt_nm,0,sizeof(*call->alt_nm)*B2B_N_NM);
+ }
memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
memset(bca->ref_scl, 0, 100*sizeof(int));
memset(bca->alt_scl, 0, 100*sizeof(int));
if (sc_len > 99) sc_len = 99;
}
}
-
int imq = mapQ * nqual_over_60;
int ibq = baseQ * nqual_over_60;
+ int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1);
if ( bam_is_rev(p->b) )
bca->rev_mqs[imq]++;
else
bca->fwd_mqs[imq]++;
- if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
+ if ( !is_diff )
{
bca->ref_pos[epos]++;
bca->ref_bq[ibq]++;
bca->ref_mq[imq]++;
bca->ref_scl[sc_len]++;
+ if ( inm>=0 )
+ {
+ bca->ref_nm[inm]++;
+ if ( r->ref_nm ) r->ref_nm[inm]++;
+ }
}
else
{
bca->alt_bq[ibq]++;
bca->alt_mq[imq]++;
bca->alt_scl[sc_len]++;
+ if ( inm>=0 )
+ {
+ bca->alt_nm[inm]++;
+ if ( r->alt_nm ) r->alt_nm[inm]++;
+ }
}
}
call->n_alleles = j;
if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
}
+ int has_alt = (call->n_alleles==2 && call->unseen!=-1) ? 0 : 1;
/*
* Set the phred likelihood array (call->PL). This array is 15 entries long
* for each sample because that is size of an upper or lower triangle of a
for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
}
+ // No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly
+ if ( !has_alt ) return 0;
+
calc_SegBias(calls, call);
// calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
// U z-normalised as +/- number of standard deviations from mean.
- if (call->ori_ref < 0) {
+ if (call->ori_ref < 0) { // indel
if (bca->fmt_flag & B2B_INFO_RPB)
call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
bca->npos, 0, 1);
call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
100, 0,1);
}
+ call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1);
+ if ( bca->fmt_flag & B2B_FMT_NMBZ )
+ {
+ for (i=0; i<n; i++)
+ {
+ float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
+ call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
+ }
+ }
} else {
// Old method; U as probability between 0 and 1
if ( bca->fmt_flag & B2B_INFO_RPB )
int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
{
extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
- int i, j, nals = 1;
+ int i, j, nals = 1, has_alt = 0;
bcf_hdr_t *hdr = bc->bcf_hdr;
rec->rid = bc->tid;
for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
}
nals++;
+ has_alt = 1;
}
}
else // SNP
if (bc->a[i] < 0) break;
kputc(',', &bc->tmp);
if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
- else kputc("ACGT"[bc->a[i]], &bc->tmp);
+ else
+ {
+ kputc("ACGT"[bc->a[i]], &bc->tmp);
+ has_alt = 1;
+ }
nals++;
}
}
bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
- if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
- if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
-
- if (bca->fmt_flag & B2B_INFO_ZSCORE) {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
- if ( bc->mwu_sc != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
- } else {
- if ( bc->mwu_pos != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
- if ( bc->mwu_mq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
- if ( bc->mwu_mqs != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
- if ( bc->mwu_bq != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
- }
+ if ( has_alt )
+ {
+ if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
+ if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
+
+ if (bca->fmt_flag & B2B_INFO_ZSCORE) {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
+ if ( bc->mwu_nm[0] != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
+ if ( bc->mwu_sc != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
+ } else {
+ if ( bc->mwu_pos != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+ }
- if ( bc->strand_bias != HUGE_VAL )
- bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
+ if ( bc->strand_bias != HUGE_VAL )
+ bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
#if CDF_MWU_TESTS
- if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
- if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
- if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
- if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
+ if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
+ if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
+ if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
+ if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
#endif
+ }
+
tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
if ( fmt_flag&B2B_FMT_QS )
bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
+ if ( has_alt )
+ {
+ if ( fmt_flag&B2B_FMT_NMBZ )
+ bcf_update_format_float(hdr, rec, "NMBZ", bc->mwu_nm+1, rec->n_sample);
+ }
+
return 0;
}
/* bam2bcf.h -- variant calling.
Copyright (C) 2010-2012 Broad Institute.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#define B2B_INFO_RPB (1<<15)
#define B2B_FMT_QS (1<<16)
#define B2B_INFO_SCB (1<<17)
+#define B2B_FMT_NMBZ (1<<18) // per-sample NMBZ
#define B2B_INFO_ZSCORE (1<<30) // MWU as-is or Z-normalised
#define B2B_MAX_ALLELES 5
+#define B2B_N_NM 32 // number of NMBZ bins, i.e. max number of mismatches
+
#define B2B_DROP 0
#define B2B_INC_AD 1
errmod_t *e;
void *rghash;
float indel_bias; // adjusts indel score threshold; lower => call more.
+ int32_t *ref_nm, *alt_nm; // pointers to bcf_call_t.{ref_nm,alt_nm}
} bcf_callaux_t;
// per-sample values
uint32_t ori_depth; // ori_depth = anno[0..3] but before --min-BQ is applied
unsigned int mq0;
int32_t *ADF, *ADR, SCR, *QS; // FMT/QS
+ int32_t *ref_nm, *alt_nm;
// The fields are:
// depth fwd .. ref (0) and non-ref (2)
// depth rev .. ref (1) and non-ref (3)
int n_supp; // number of supporting non-reference reads
double anno[16];
unsigned int depth, ori_depth, mq0;
- int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS;
+ int32_t *PL, *DP4, *ADR, *ADF, *SCR, *QS, *ref_nm, *alt_nm;
uint8_t *fmt_arr;
float vdb; // variant distance bias
- float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc;
+ float mwu_pos, mwu_mq, mwu_bq, mwu_mqs, mwu_sc, *mwu_nm;
#if CDF_MWU_TESTS
float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
#endif
/* The MIT License
- Copyright (c) 2014-2021 Genome Research Ltd.
+ Copyright (c) 2014-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
}
args_t;
+static void destroy_chain(chain_t *chain) // Releases the chain's ref_gaps, alt_gaps and block_lengths members and then the chain itself
+{
+ if ( !chain ) return; // a NULL chain is accepted and ignored
+ free(chain->ref_gaps);
+ free(chain->alt_gaps);
+ free(chain->block_lengths);
+ free(chain); // the caller's pointer is left dangling; it is not reset here
+}
static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
{
-// fprintf(stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
+ if ( chain ) destroy_chain(chain);
chain = (chain_t*) calloc(1,sizeof(chain_t));
chain->num = 0;
chain->block_lengths = NULL;
return chain;
}
-static void destroy_chain(args_t *args)
-{
- chain_t *chain = args->chain;
- free(chain->ref_gaps);
- free(chain->alt_gaps);
- free(chain->block_lengths);
- free(chain);
- chain = NULL;
- free(args->chr);
- args->chr = NULL;
-}
-
static void print_chain(args_t *args)
{
/*
- alt_start (same as ref_start, as no edits are recorded/applied before that position)
- alt_end (adjusted to match the length of the alt sequence)
- chain_num (just an auto-increment id)
-
+
the other (sorted) lines are:
- length of the ungapped alignment block
- gap on the ref sequence between this and the next block (all but the last line)
static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
{
-// fprintf(stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
+ // fprintf(stderr, "push_chain_gap(chain=%p, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", chain, ref_start, ref_len, alt_start, alt_len);
int num = chain->num;
if (num && ref_start <= chain->ref_last_block_ori) {
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
+ destroy_chain(args->chain);
}
static void init_region(args_t *args, char *line)
bcf_sr_seek(args->files,line,args->fa_ori_pos);
if ( tmp_ptr ) *tmp_ptr = tmp;
fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
- if (args->chain_fname )
- {
+ if ( args->chain_fname )
args->chain = init_chain(args->chain, args->fa_ori_pos);
- } else {
- args->chain = NULL;
- }
}
static bcf1_t **next_vcf_line(args_t *args)
if ( !args->missing_allele ) return;
ialt = -1;
}
- else
+ else
{
if ( !warned_haplotype )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
- else
+ else
ialt = bcf_gt_allele(ialt);
}
}
- else if ( action==use_iupac )
+ else if ( action==use_iupac )
{
ialt = -1;
int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
fprintf(stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
return;
}
-
+
}
char *alt_allele = rec->d.allele[ialt];
}
}
}
- if ( idx>=args->fa_buf.l )
+ if ( idx>=args->fa_buf.l )
error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
if ( fail )
{
char tmp = 0;
- if ( args->fa_buf.l - idx > rec->rlen )
- {
+ if ( args->fa_buf.l - idx > rec->rlen )
+ {
tmp = args->fa_buf.s[idx+rec->rlen];
args->fa_buf.s[idx+rec->rlen] = 0;
}
alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
- if ( args->mark_del && len_diff<0 )
+ if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
- if ( args->mark_del && len_diff<0 )
+ if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
if ( str.s[0]=='>' )
{
// new sequence encountered
- if (args->chain) {
- print_chain(args);
- destroy_chain(args);
- }
+ if ( args->chain ) print_chain(args);
+
// apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
bcf1_t **rec_ptr = NULL;
while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
apply_variant(args, rec);
}
- if (args->chain)
- {
- print_chain(args);
- destroy_chain(args);
- }
+ if (args->chain) print_chain(args);
if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
flush_fa_buffer(args, 0);
bgzf_close(fasta);
fprintf(stderr, " # in the form \">chr:from-to\".\n");
fprintf(stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
fprintf(stderr, "\n");
+ fprintf(stderr, " # See also http://samtools.github.io/bcftools/howtos/consensus-sequence.html\n");
+ fprintf(stderr, "\n");
exit(1);
}
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
- static struct option loptions[] =
+ static struct option loptions[] =
{
{"mark-del",required_argument,NULL,1},
{"mark-ins",required_argument,NULL,2},
int c;
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
{
- switch (c)
+ switch (c)
{
case 1 : args->mark_del = optarg[0]; break;
case 2 :
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'I': args->output_iupac = 1; break;
- case 'e':
+ case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i':
+ case 'i':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
args->absent_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
break;
- case 'M':
- args->missing_allele = optarg[0];
+ case 'M':
+ args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
break;
case 'c': args->chain_fname = optarg; break;
- case 'H':
+ case 'H':
if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
/* The MIT License
- Copyright (c) 2014-2021 Genome Research Ltd.
+ Copyright (c) 2014-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
}
args_t;
+static void destroy_chain(chain_t *chain) // Releases the chain's ref_gaps, alt_gaps and block_lengths members and then the chain itself
+{
+ if ( !chain ) return; // a NULL chain is accepted and ignored
+ free(chain->ref_gaps);
+ free(chain->alt_gaps);
+ free(chain->block_lengths);
+ free(chain); // the caller's pointer is left dangling; it is not reset here
+}
static chain_t* init_chain(chain_t *chain, int ref_ori_pos)
{
-// fprintf(bcftools_stderr, "init_chain(*chain, ref_ori_pos=%d)\n", ref_ori_pos);
+ if ( chain ) destroy_chain(chain);
chain = (chain_t*) calloc(1,sizeof(chain_t));
chain->num = 0;
chain->block_lengths = NULL;
return chain;
}
-static void destroy_chain(args_t *args)
-{
- chain_t *chain = args->chain;
- free(chain->ref_gaps);
- free(chain->alt_gaps);
- free(chain->block_lengths);
- free(chain);
- chain = NULL;
- free(args->chr);
- args->chr = NULL;
-}
-
static void print_chain(args_t *args)
{
/*
- alt_start (same as ref_start, as no edits are recorded/applied before that position)
- alt_end (adjusted to match the length of the alt sequence)
- chain_num (just an auto-increment id)
-
+
the other (sorted) lines are:
- length of the ungapped alignment block
- gap on the ref sequence between this and the next block (all but the last line)
static void push_chain_gap(chain_t *chain, int ref_start, int ref_len, int alt_start, int alt_len)
{
-// fprintf(bcftools_stderr, "push_chain_gap(*chain, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", ref_start, ref_len, alt_start, alt_len);
+ // fprintf(bcftools_stderr, "push_chain_gap(chain=%p, ref_start=%d, ref_len=%d, alt_start=%d, alt_len=%d)\n", chain, ref_start, ref_len, alt_start, alt_len);
int num = chain->num;
if (num && ref_start <= chain->ref_last_block_ori) {
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
+ destroy_chain(args->chain);
}
static void init_region(args_t *args, char *line)
bcf_sr_seek(args->files,line,args->fa_ori_pos);
if ( tmp_ptr ) *tmp_ptr = tmp;
fprintf(args->fp_out,">%s%s\n",args->chr_prefix?args->chr_prefix:"",line);
- if (args->chain_fname )
- {
+ if ( args->chain_fname )
args->chain = init_chain(args->chain, args->fa_ori_pos);
- } else {
- args->chain = NULL;
- }
}
static bcf1_t **next_vcf_line(args_t *args)
if ( !args->missing_allele ) return;
ialt = -1;
}
- else
+ else
{
if ( !warned_haplotype )
{
if ( !args->missing_allele ) return;
ialt = -1;
}
- else
+ else
ialt = bcf_gt_allele(ialt);
}
}
- else if ( action==use_iupac )
+ else if ( action==use_iupac )
{
ialt = -1;
int is_missing = 0, alen = 0, mlen = 0, fallback_alt = -1;
fprintf(bcftools_stderr,"The site %s:%"PRId64" overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1);
return;
}
-
+
}
char *alt_allele = rec->d.allele[ialt];
}
}
}
- if ( idx>=args->fa_buf.l )
+ if ( idx>=args->fa_buf.l )
error("FIXME: %s:%"PRId64" .. idx=%d, ori_pos=%d, len=%"PRIu64", off=%d\n",bcf_seqname(args->hdr,rec),(int64_t) rec->pos+1,idx,args->fa_ori_pos,(uint64_t)args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
if ( fail )
{
char tmp = 0;
- if ( args->fa_buf.l - idx > rec->rlen )
- {
+ if ( args->fa_buf.l - idx > rec->rlen )
+ {
tmp = args->fa_buf.s[idx+rec->rlen];
args->fa_buf.s[idx+rec->rlen] = 0;
}
alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
- if ( args->mark_del && len_diff<0 )
+ if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
alen = strlen(alt_allele);
len_diff = alen - rec->rlen;
- if ( args->mark_del && len_diff<0 )
+ if ( args->mark_del && len_diff<0 )
{
alt_allele = mark_del(rec->d.allele[0], rec->rlen, alt_allele, args->mark_del);
alen = rec->rlen;
if ( str.s[0]=='>' )
{
// new sequence encountered
- if (args->chain) {
- print_chain(args);
- destroy_chain(args);
- }
+ if ( args->chain ) print_chain(args);
+
// apply all cached variants and variants that might have been missed because of short fasta (see test/consensus.9.*)
bcf1_t **rec_ptr = NULL;
while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
apply_variant(args, rec);
}
- if (args->chain)
- {
- print_chain(args);
- destroy_chain(args);
- }
+ if (args->chain) print_chain(args);
if ( args->absent_allele ) apply_absent(args, HTS_POS_MAX);
flush_fa_buffer(args, 0);
bgzf_close(fasta);
fprintf(bcftools_stderr, " # in the form \">chr:from-to\".\n");
fprintf(bcftools_stderr, " samtools faidx ref.fa 8:11870-11890 | bcftools consensus in.vcf.gz > out.fa\n");
fprintf(bcftools_stderr, "\n");
+ fprintf(bcftools_stderr, " # See also http://samtools.github.io/bcftools/howtos/consensus-sequence.html\n");
+ fprintf(bcftools_stderr, "\n");
bcftools_exit(1);
}
args_t *args = (args_t*) calloc(1,sizeof(args_t));
args->argc = argc; args->argv = argv;
- static struct option loptions[] =
+ static struct option loptions[] =
{
{"mark-del",required_argument,NULL,1},
{"mark-ins",required_argument,NULL,2},
int c;
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:M:p:a:",loptions,NULL)) >= 0)
{
- switch (c)
+ switch (c)
{
case 1 : args->mark_del = optarg[0]; break;
case 2 :
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'I': args->output_iupac = 1; break;
- case 'e':
+ case 'e':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
- case 'i':
+ case 'i':
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");
args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
args->absent_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -a, got \"%s\"\n", optarg);
break;
- case 'M':
- args->missing_allele = optarg[0];
+ case 'M':
+ args->missing_allele = optarg[0];
if ( optarg[1]!=0 ) error("Expected single character with -M, got \"%s\"\n", optarg);
break;
case 'c': args->chain_fname = optarg; break;
- case 'H':
+ case 'H':
if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
}
else if ( bcf_gt_is_missing(ptr[0]) )
{
- if ( ptr[1]==bcf_int8_vector_end )
+ if ( ptr[1]==bcf_int8_vector_end )
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
- else
- {
+ else
+ {
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
}
}
if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str);
- else
+ else
{
double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5);
pval *= 2;
- assert( pval-1 < 1e-10 );
if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
else
pval = -4.34294481903*log(pval);
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
- else if ( !strcmp(str.s, "TBCSQ") )
+ else if ( !strcmp(str.s, "TBCSQ") )
{
fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ);
fmt->subscript = parse_subscript(&q);
if ( fmt->subscript==-1 )
- {
+ {
if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; }
}
else fmt->subscript++;
else
{
_SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
- else if ( !strcmp(str.s, "ALT") )
+ else if ( !strcmp(str.s, "ALT") )
{
fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
fmt->subscript = parse_subscript(&q);
str->l = 0;
for (i=0; i<convert->nfmt; i++)
{
- // Genotype fields.
+ // Genotype fields.
if ( convert->fmt[i].is_gt_field )
{
int j = i, js, k;
// anything to the string, we trim all genotype fields enclosed in square
// brackets here. This may be changed in future, time will show...
size_t l_start = str->l;
-
+
for (k=i; k<j; k++)
{
if ( convert->fmt[k].type == T_MASK )
va_list args;
va_start(args, opt);
- switch (opt)
+ switch (opt)
{
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
}
else if ( bcf_gt_is_missing(ptr[0]) )
{
- if ( ptr[1]==bcf_int8_vector_end )
+ if ( ptr[1]==bcf_int8_vector_end )
{
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
}
- else
- {
+ else
+ {
str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
}
}
if ( n[0]==n[1] ) kputc(n[0]==0 ? '.':'0', str);
- else
+ else
{
double pval = n[0] < n[1] ? kf_betai(n[1], n[0] + 1, 0.5) : kf_betai(n[0], n[1] + 1, 0.5);
pval *= 2;
- assert( pval-1 < 1e-10 );
if ( pval>=1 ) pval = 0; // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
else
pval = -4.34294481903*log(pval);
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, "SAMPLE", is_gtf, T_SAMPLE);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, "GT", is_gtf, T_GT);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, "GT", is_gtf, T_TGT);
- else if ( !strcmp(str.s, "TBCSQ") )
+ else if ( !strcmp(str.s, "TBCSQ") )
{
fmt_t *fmt = register_tag(convert, "BCSQ", is_gtf, T_TBCSQ);
fmt->subscript = parse_subscript(&q);
if ( fmt->subscript==-1 )
- {
+ {
if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; }
}
else fmt->subscript++;
else
{
_SET_NON_FORMAT_TAGS(register_tag, str.s, convert, str.s, is_gtf)
- else if ( !strcmp(str.s, "ALT") )
+ else if ( !strcmp(str.s, "ALT") )
{
fmt_t *fmt = register_tag(convert, str.s, is_gtf, T_ALT);
fmt->subscript = parse_subscript(&q);
str->l = 0;
for (i=0; i<convert->nfmt; i++)
{
- // Genotype fields.
+ // Genotype fields.
if ( convert->fmt[i].is_gt_field )
{
int j = i, js, k;
// anything to the string, we trim all genotype fields enclosed in square
// brackets here. This may be changed in future, time will show...
size_t l_start = str->l;
-
+
for (k=i; k<j; k++)
{
if ( convert->fmt[k].type == T_MASK )
va_list args;
va_start(args, opt);
- switch (opt)
+ switch (opt)
{
case allow_undef_tags:
convert->allow_undef_tags = va_arg(args, int);
#define TOK_sMEDIAN 35
#define TOK_sSTDEV 36
#define TOK_sSUM 37
-#define TOK_IN 38 // contains, e.g. FILTER~"A"
-#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
+#define TOK_IN 38 // contains, e.g. FILTER~"A"
+#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
+#define TOK_MODULO 40 // %
-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s %
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 };
#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently
static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok);
if ( tmp[0]=='-' ) break;
if ( tmp[0]=='/' ) break;
if ( tmp[0]=='~' ) break;
+ if ( tmp[0]=='%' ) break;
}
if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
- if ( tmp[0]=='[' ) square_brackets++;
+ if ( tmp[0]=='[' ) square_brackets++;
tmp++;
}
if ( tmp > *str )
if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; }
if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; }
if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; }
+ if ( tmp[0]=='%' ) { (*str) += 1; return TOK_MODULO; }
*len = tmp - (*str);
return TOK_VAL;
/*
Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
-
+
Based on jkb's staden code with some adjustments.
https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123
*/
}
// Evaluate a comparison against the FILTER column of one record.
//
// The ids being asked for are read from btok->values[0..btok->nvalues-1]; an
// empty list stands for the missing value ".". The filters set at the record
// are read from line->d.flt[0..line->d.n_flt-1]. The operator is taken from
// rtok->tok_type:
//   TOK_IN      pass when every queried id is found at the record
//   TOK_NOT_IN  pass when at least one queried id is not found at the record
//   TOK_EQ      pass when both lists have the same length and every queried id is found
//   TOK_NE      pass when the lengths differ or some queried id is not found
// With the "." query, TOK_IN passes only when the record has no filters and
// TOK_NOT_IN only when it has some. Any other operator ends in error().
//
// The outcome is reported solely by setting rtok->pass_site to 1; the flag is
// never cleared here, so the caller presumably resets it beforehand (TODO confirm).
// NOTE(review): TOK_EQ/TOK_NE treat "same length + every queried id found" as an
// exact match, which assumes neither list repeats an id - confirm.
static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
- int i;
- if ( rtok->tok_type==TOK_NOT_IN )
+ // the btok values contain FILTER ids obtained by parsing the user expression
+ int i,j;
+ if ( rtok->tok_type==TOK_NOT_IN ) // fail if the query expression is a subset of the VCF FILTER
{
- if ( !line->d.n_flt )
+ if ( !btok->nvalues ) // the query expression is ".", pass everything unless the VCF is also "."
+ {
+ if ( line->d.n_flt ) rtok->pass_site = 1;
+ return;
+ }
+ if ( !line->d.n_flt ) // no filters at this VCF line and the query expression has a value
{
- if ( atok->hdr_id==-1 ) return; // missing value
rtok->pass_site = 1;
- return; // no filter present, eval to true
+ return;
}
- for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) return;
- rtok->pass_site = 1;
+ for (j=0; j<btok->nvalues; j++) // some query expression value must be absent from VCF in order to pass
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
+ }
+ if ( j!=btok->nvalues ) rtok->pass_site = 1;
// j stops short of btok->nvalues only when the inner scan ran off the end of line->d.flt
return;
}
else if ( rtok->tok_type==TOK_IN )
{
- if ( !line->d.n_flt )
+ if ( !btok->nvalues ) // the query expression is ".", fail everything unless the VCF is also "."
{
- if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
- return; // no filter present, eval to false
+ if ( !line->d.n_flt ) rtok->pass_site = 1;
+ return;
+ }
+ if ( !line->d.n_flt ) return; // no filters at this VCF line and the query expression has a value
+ for (j=0; j<btok->nvalues; j++) // all of the query values must be present in the VCF in order to pass
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
}
- for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
+ if ( j==btok->nvalues ) rtok->pass_site = 1;
// j reaches btok->nvalues only when no queried id was missing
return;
}
- else if ( rtok->tok_type==TOK_NE ) // exact match
+ else if ( rtok->tok_type==TOK_NE ) // require anything but exact match
{
- if ( !line->d.n_flt )
+ if ( btok->nvalues != line->d.n_flt )
{
- if ( atok->hdr_id==-1 ) return; // missing value
rtok->pass_site = 1;
- return; // no filter present, eval to true
+ return;
}
- if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present
- rtok->pass_site = 1;
+ if ( !btok->nvalues ) return;
+ for (j=0; j<btok->nvalues; j++) // some of the query values must be absent from the VCF in order to pass
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
+ }
+ if ( j!=btok->nvalues ) rtok->pass_site = 1;
return;
}
- else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present
+ else if ( rtok->tok_type==TOK_EQ ) // require exact match
{
- if ( !line->d.n_flt )
+ if ( btok->nvalues != line->d.n_flt ) return;
+ if ( !btok->nvalues )
+ {
+ rtok->pass_site = 1;
+ return;
+ }
+ for (j=0; j<btok->nvalues; j++) // all of the query values must be present in the VCF in order to pass
{
- if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
- return; // no filter present, eval to false
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
}
- if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
+ if ( j==btok->nvalues ) rtok->pass_site = 1;
return;
}
- else
+ else
error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
return;
}
if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
- if ( rtok->tok_type==TOK_EQ )
+ if ( rtok->tok_type==TOK_EQ )
rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
else if ( rtok->tok_type==TOK_NE )
rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
tok->nvalues = tok->str_value.l = 0;
return;
}
-
+
int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
if ( tok->str_value.m <= nvals1*nsmpl )
{
tok->nvalues = 0;
return;
}
-
+
int j,nmissing = 0;
#define BRANCH(type_t, is_vector_end) { \
for (i=0; i<line->n_sample; i++) \
// Reduce tok to a single value: run filters_set_ac() on the record, then store
// flt->tmpi[0] in tok->values[0], or 0 when filters_set_ac() left tok->nvalues
// at zero. tok->nvalues is always 1 on return.
// NOTE(review): presumably flt->tmpi[0] is the total allele number (AN) as
// filled in by filters_set_ac() - confirm against that function.
static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok)
{
filters_set_ac(flt,line,tok);
- tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
+ tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
tok->nvalues = 1;
}
static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok)
double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5);
pval *= 2;
- assert( pval-1 < 1e-10 );
if ( pval>1 ) pval = 1; // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
return pval;
if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
{
rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
- rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
+ rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
int i;
for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
memset(rtok->pass_samples, 0, rtok->nsamples);
}
-#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
+#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP,TYPE) \
{ \
token_t *rtok = _rtok; \
int i, has_values = 0; \
continue; \
} \
has_values = 1; \
- rtok->values[i] = atok->values[i] AOP btok->values[i]; \
+ rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \
} \
} \
else if ( atok->nsamples ) \
continue; \
} \
has_values = 1; \
- rtok->values[i] = atok->values[i] AOP btok->values[0]; \
+ rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[0]; \
} \
} \
} \
continue; \
} \
has_values = 1; \
- rtok->values[i] = atok->values[0] AOP btok->values[i]; \
+ rtok->values[i] = TYPE atok->values[0] AOP TYPE btok->values[i]; \
} \
} \
} \
for (i=0; i<rtok->nsamples; i++)
{
if ( !rtok->usmpl[i] ) continue;
- rtok->pass_samples[i] = tok->pass_samples[i];
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
rtok->pass_site = 1;
return 2;
return;
}
if ( !regex )
- rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
+ rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
else
{
token_t *tok = atok->regex ? btok : atok;
{
if ( missing_logic[2] )
{
- for (i=0; i<rtok->nsamples; i++)
+ for (i=0; i<rtok->nsamples; i++)
if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
}
return;
return;
}
- // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
+ // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
token_t *xtok = atok->nsamples ? atok : btok;
token_t *ytok = atok->nsamples ? btok : atok;
assert( regex==ytok->regex );
if ( !list ) error("Could not read: %s\n", fname);
free(fname);
tok->nsamples = bcf_hdr_nsamples(hdr);
- tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
+ tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
for (i=0; i<nsmpl; i++)
{
int ismpl = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,list[i]);
}
else
{
- tok->idxs = idxs1;
+ tok->idxs = idxs1;
tok->nidxs = nidxs1;
tok->idx = idx1;
}
if ( set_samples )
{
tok->nsamples = bcf_hdr_nsamples(hdr);
- tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
+ tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
if ( idx1>=0 )
{
if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
{
tok->tok_type = TOK_VAL;
tok->threshold = bcf_hdr_nsamples(filter->hdr);
+ tok->is_constant = 1;
return 0;
}
else if ( !strncasecmp(str,"N_MISSING",len) )
}
if ( is_fmt==-1 ) is_fmt = 0;
}
- if ( is_array )
+ if ( is_array )
parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
- else if ( is_fmt && !tok->nsamples )
+ else if ( is_fmt && !tok->nsamples )
{
int i;
tok->nsamples = bcf_hdr_nsamples(filter->hdr);
- tok->usmpl = (uint8_t*) malloc(tok->nsamples);
+ tok->usmpl = (uint8_t*) malloc(tok->nsamples);
for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
}
case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
}
- if (!is_array)
+ if (!is_array)
{
tok->idx = -2;
tok->idxs = (int*) malloc(sizeof(int));
char **env = NULL;
PERL_SYS_INIT3(&argc, &argv, &env);
}
-
+
filter->perl = perl_alloc();
PerlInterpreter *perl = filter->perl;
tok->hdr_id = -1;
tok->pass_site = -1;
tok->threshold = -1.0;
+ tok->is_constant = 1;
ret = TOK_MULT;
}
else if ( ret == -TOK_FUNC )
else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; }
else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
+ out[ival].is_constant = 1;
if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
out[ival].tag = out[ival].key; out[ival].key = NULL;
i = itok;
else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( strcmp(".",out[ival].key) )
+ token_t *tok = &out[ival];
+ char *bp = tok->key;
+ tok->nvalues = 0;
+ int has_missing = 0;
+ while ( *bp )
{
- out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
- if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
- error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
+ char tmp, *ep = bp;
+ while ( *ep && *ep!=';' ) ep++;
+ tmp = *ep;
+ *ep = 0;
+ if ( !strcmp(".",bp) ) has_missing = 1;
+ else
+ {
+ tok->nvalues++;
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, bp);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,id) )
+ error("The filter \"%s\" not present in the VCF header\n", bp);
+ tok->values[tok->nvalues-1] = id;
+ }
+ *ep = tmp;
+ if ( !tmp ) break;
+ bp = ep + 1;
}
- else
- out[ival].hdr_id = -1;
- out[ival].tag = out[ival].key; out[ival].key = NULL;
- out[itok].hdr_id = out[ival].hdr_id;
+ if ( has_missing && tok->nvalues ) error("The FILTER expression cannot contain missing value AND filters: \"%s\" (%d)\n",tok->key,tok->nvalues);
+ out[ival].tag = tok->key;
+ tok->key = NULL;
+ out[itok].hdr_id = tok->hdr_id;
continue;
}
}
kputs(filter->filters[i].key, &filter->filters[i].str_value);
filter->filters[i].nvalues = filter->filters[i].str_value.l;
}
- else // numeric constant
+ else if ( filter->filters[i].is_constant ) // numeric constant
{
filter->filters[i].values[0] = filter->filters[i].threshold;
filter->filters[i].nvalues = 1;
if ( filter->filters[i].tok_type == TOK_ADD )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+,(double));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_SUB )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-,(double));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_MULT )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*,(double));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_DIV )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/,(double));
+ filter->flt_stack[nstack-2] = &filter->filters[i];
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_MODULO )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],%,(int));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
#define TOK_sMEDIAN 35
#define TOK_sSTDEV 36
#define TOK_sSUM 37
-#define TOK_IN 38 // contains, e.g. FILTER~"A"
-#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
+#define TOK_IN 38 // contains, e.g. FILTER~"A"
+#define TOK_NOT_IN 39 // does not contain, e.g. FILTER!~"A"
+#define TOK_MODULO 40 // %
-// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
-// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s
-static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
+// ( ) [ < = > ] ! | & + - * / M m a A O ~ ^ S . l f c p b P i s %
+static int op_prec[] = {0,1,1,5,5,5,5,5,5,2,3, 6, 6, 7, 7, 8, 8, 8, 3, 2, 5, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7 };
#define TOKEN_STRING "x()[<=>]!|&+-*/MmaAO~^S.lfcpis" // this is only for debugging, not maintained diligently
static void cmp_vector_strings(token_t *atok, token_t *btok, token_t *rtok);
if ( tmp[0]=='-' ) break;
if ( tmp[0]=='/' ) break;
if ( tmp[0]=='~' ) break;
+ if ( tmp[0]=='%' ) break;
}
if ( tmp[0]==']' ) { if (square_brackets) tmp++; break; }
- if ( tmp[0]=='[' ) square_brackets++;
+ if ( tmp[0]=='[' ) square_brackets++;
tmp++;
}
if ( tmp > *str )
if ( tmp[0]=='*' ) { (*str) += 1; return TOK_MULT; }
if ( tmp[0]=='/' ) { (*str) += 1; return TOK_DIV; }
if ( tmp[0]=='~' ) { (*str) += 1; return TOK_LIKE; }
+ if ( tmp[0]=='%' ) { (*str) += 1; return TOK_MODULO; }
*len = tmp - (*str);
return TOK_VAL;
/*
Simple path expansion, expands ~/, ~user, $var. The result must be freed by the caller.
-
+
Based on jkb's staden code with some adjustments.
https://sourceforge.net/p/staden/code/HEAD/tree/staden/trunk/src/Misc/getfile.c#l123
*/
}
// Evaluate a comparison against the FILTER column of one record.
//
// The ids being asked for are read from btok->values[0..btok->nvalues-1]; an
// empty list stands for the missing value ".". The filters set at the record
// are read from line->d.flt[0..line->d.n_flt-1]. The operator is taken from
// rtok->tok_type:
//   TOK_IN      pass when every queried id is found at the record
//   TOK_NOT_IN  pass when at least one queried id is not found at the record
//   TOK_EQ      pass when both lists have the same length and every queried id is found
//   TOK_NE      pass when the lengths differ or some queried id is not found
// With the "." query, TOK_IN passes only when the record has no filters and
// TOK_NOT_IN only when it has some. Any other operator ends in error().
//
// The outcome is reported solely by setting rtok->pass_site to 1; the flag is
// never cleared here, so the caller presumably resets it beforehand (TODO confirm).
// NOTE(review): TOK_EQ/TOK_NE treat "same length + every queried id found" as an
// exact match, which assumes neither list repeats an id - confirm.
static void filters_cmp_filter(token_t *atok, token_t *btok, token_t *rtok, bcf1_t *line)
{
- int i;
- if ( rtok->tok_type==TOK_NOT_IN )
+ // the btok values contain FILTER ids obtained by parsing the user expression
+ int i,j;
+ if ( rtok->tok_type==TOK_NOT_IN ) // fail if the query expression is a subset of the VCF FILTER
{
- if ( !line->d.n_flt )
+ if ( !btok->nvalues ) // the query expression is ".", pass everything unless the VCF is also "."
+ {
+ if ( line->d.n_flt ) rtok->pass_site = 1;
+ return;
+ }
+ if ( !line->d.n_flt ) // no filters at this VCF line and the query expression has a value
{
- if ( atok->hdr_id==-1 ) return; // missing value
rtok->pass_site = 1;
- return; // no filter present, eval to true
+ return;
}
- for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) return;
- rtok->pass_site = 1;
+ for (j=0; j<btok->nvalues; j++) // some query expression value must be absent from VCF in order to pass
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
+ }
+ if ( j!=btok->nvalues ) rtok->pass_site = 1;
// j stops short of btok->nvalues only when the inner scan ran off the end of line->d.flt
return;
}
else if ( rtok->tok_type==TOK_IN )
{
- if ( !line->d.n_flt )
+ if ( !btok->nvalues ) // the query expression is ".", fail everything unless the VCF is also "."
{
- if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
- return; // no filter present, eval to false
+ if ( !line->d.n_flt ) rtok->pass_site = 1;
+ return;
+ }
+ if ( !line->d.n_flt ) return; // no filters at this VCF line and the query expression has a value
+ for (j=0; j<btok->nvalues; j++) // all of the query values must be present in the VCF in order to pass
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
}
- for (i=0; i<line->d.n_flt; i++)
- if ( atok->hdr_id==line->d.flt[i] ) { rtok->pass_site = 1; return; }
+ if ( j==btok->nvalues ) rtok->pass_site = 1;
// j reaches btok->nvalues only when no queried id was missing
return;
}
- else if ( rtok->tok_type==TOK_NE ) // exact match
+ else if ( rtok->tok_type==TOK_NE ) // require anything but exact match
{
- if ( !line->d.n_flt )
+ if ( btok->nvalues != line->d.n_flt )
{
- if ( atok->hdr_id==-1 ) return; // missing value
rtok->pass_site = 1;
- return; // no filter present, eval to true
+ return;
}
- if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) return; // exact match, fail iff a single matching value is present
- rtok->pass_site = 1;
+ if ( !btok->nvalues ) return;
+ for (j=0; j<btok->nvalues; j++) // some of the query values must be absent from the VCF in order to pass
+ {
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
+ }
+ if ( j!=btok->nvalues ) rtok->pass_site = 1;
return;
}
- else if ( rtok->tok_type==TOK_EQ ) // exact match, pass iff a single matching value is present
+ else if ( rtok->tok_type==TOK_EQ ) // require exact match
{
- if ( !line->d.n_flt )
+ if ( btok->nvalues != line->d.n_flt ) return;
+ if ( !btok->nvalues )
+ {
+ rtok->pass_site = 1;
+ return;
+ }
+ for (j=0; j<btok->nvalues; j++) // all of the query values must be present in the VCF in order to pass
{
- if ( atok->hdr_id==-1 ) { rtok->pass_site = 1; return; }
- return; // no filter present, eval to false
+ for (i=0; i<line->d.n_flt; i++)
+ if ( btok->values[j]==line->d.flt[i] ) break;
+ if ( i==line->d.n_flt ) break; // the query is not in the VCF
}
- if ( line->d.n_flt==1 && atok->hdr_id==line->d.flt[0] ) rtok->pass_site = 1;
+ if ( j==btok->nvalues ) rtok->pass_site = 1;
return;
}
- else
+ else
error("Only ==, !=, ~, and !~ operators are supported for FILTER\n");
return;
}
if ( !btok->str_value.l ) error("Error occurred while evaluating the expression\n");
- if ( rtok->tok_type==TOK_EQ )
+ if ( rtok->tok_type==TOK_EQ )
rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 0 : 1;
else if ( rtok->tok_type==TOK_NE )
rtok->pass_site = strcmp(btok->str_value.s,line->d.id) ? 1 : 0;
tok->nvalues = tok->str_value.l = 0;
return;
}
-
+
int i,j, nsmpl = bcf_hdr_nsamples(flt->hdr), nvals1 = type==2 ? 3 : 4;
if ( tok->str_value.m <= nvals1*nsmpl )
{
tok->nvalues = 0;
return;
}
-
+
int j,nmissing = 0;
#define BRANCH(type_t, is_vector_end) { \
for (i=0; i<line->n_sample; i++) \
// Reduce tok to a single value: run filters_set_ac() on the record, then store
// flt->tmpi[0] in tok->values[0], or 0 when filters_set_ac() left tok->nvalues
// at zero. tok->nvalues is always 1 on return.
// NOTE(review): presumably flt->tmpi[0] is the total allele number (AN) as
// filled in by filters_set_ac() - confirm against that function.
static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok)
{
filters_set_ac(flt,line,tok);
- tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
+ tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
tok->nvalues = 1;
}
static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok)
double pval = na < nb ? kf_betai(nb, na + 1, 0.5) : kf_betai(na, nb + 1, 0.5);
pval *= 2;
- assert( pval-1 < 1e-10 );
if ( pval>1 ) pval = 1; // this can happen, machine precision error, eg. kf_betai(1,0,0.5)
return pval;
if ( (atok->nsamples || btok->nsamples) && !rtok->nsamples )
{
rtok->nsamples = atok->nsamples ? atok->nsamples : btok->nsamples;
- rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
+ rtok->usmpl = (uint8_t*) calloc(rtok->nsamples,1);
int i;
for (i=0; i<atok->nsamples; i++) rtok->usmpl[i] |= atok->usmpl[i];
for (i=0; i<btok->nsamples; i++) rtok->usmpl[i] |= btok->usmpl[i];
memset(rtok->pass_samples, 0, rtok->nsamples);
}
-#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP) \
+#define VECTOR_ARITHMETICS(atok,btok,_rtok,AOP,TYPE) \
{ \
token_t *rtok = _rtok; \
int i, has_values = 0; \
continue; \
} \
has_values = 1; \
- rtok->values[i] = atok->values[i] AOP btok->values[i]; \
+ rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[i]; \
} \
} \
else if ( atok->nsamples ) \
continue; \
} \
has_values = 1; \
- rtok->values[i] = atok->values[i] AOP btok->values[0]; \
+ rtok->values[i] = TYPE atok->values[i] AOP TYPE btok->values[0]; \
} \
} \
} \
continue; \
} \
has_values = 1; \
- rtok->values[i] = atok->values[0] AOP btok->values[i]; \
+ rtok->values[i] = TYPE atok->values[0] AOP TYPE btok->values[i]; \
} \
} \
} \
for (i=0; i<rtok->nsamples; i++)
{
if ( !rtok->usmpl[i] ) continue;
- rtok->pass_samples[i] = tok->pass_samples[i];
+ rtok->pass_samples[i] = tok->pass_samples[i];
}
rtok->pass_site = 1;
return 2;
return;
}
if ( !regex )
- rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
+ rtok->pass_site = _match_vector_strings(atok->str_value.s, atok->str_value.l, btok->str_value.s, btok->str_value.l, logic, missing_logic);
else
{
token_t *tok = atok->regex ? btok : atok;
{
if ( missing_logic[2] )
{
- for (i=0; i<rtok->nsamples; i++)
+ for (i=0; i<rtok->nsamples; i++)
if ( rtok->usmpl[i] ) { rtok->pass_samples[i] = missing_logic[2]; rtok->pass_site = 1; }
}
return;
return;
}
- // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
+ // The case of (!atok->nsamples || !btok->nsamples) && (atok->nvalues && btok->nvalues)
token_t *xtok = atok->nsamples ? atok : btok;
token_t *ytok = atok->nsamples ? btok : atok;
assert( regex==ytok->regex );
if ( !list ) error("Could not read: %s\n", fname);
free(fname);
tok->nsamples = bcf_hdr_nsamples(hdr);
- tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
+ tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
for (i=0; i<nsmpl; i++)
{
int ismpl = bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,list[i]);
}
else
{
- tok->idxs = idxs1;
+ tok->idxs = idxs1;
tok->nidxs = nidxs1;
tok->idx = idx1;
}
if ( set_samples )
{
tok->nsamples = bcf_hdr_nsamples(hdr);
- tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
+ tok->usmpl = (uint8_t*) calloc(tok->nsamples,1);
if ( idx1>=0 )
{
if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
{
tok->tok_type = TOK_VAL;
tok->threshold = bcf_hdr_nsamples(filter->hdr);
+ tok->is_constant = 1;
return 0;
}
else if ( !strncasecmp(str,"N_MISSING",len) )
}
if ( is_fmt==-1 ) is_fmt = 0;
}
- if ( is_array )
+ if ( is_array )
parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
- else if ( is_fmt && !tok->nsamples )
+ else if ( is_fmt && !tok->nsamples )
{
int i;
tok->nsamples = bcf_hdr_nsamples(filter->hdr);
- tok->usmpl = (uint8_t*) malloc(tok->nsamples);
+ tok->usmpl = (uint8_t*) malloc(tok->nsamples);
for (i=0; i<tok->nsamples; i++) tok->usmpl[i] = 1;
}
case BCF_HT_STR: tok->setter = &filters_set_info_string; tok->is_str = 1; break;
default: error("[%s:%d %s] FIXME\n", __FILE__,__LINE__,__FUNCTION__);
}
- if (!is_array)
+ if (!is_array)
{
tok->idx = -2;
tok->idxs = (int*) malloc(sizeof(int));
char **env = NULL;
PERL_SYS_INIT3(&argc, &argv, &env);
}
-
+
filter->perl = perl_alloc();
PerlInterpreter *perl = filter->perl;
tok->hdr_id = -1;
tok->pass_site = -1;
tok->threshold = -1.0;
+ tok->is_constant = 1;
ret = TOK_MULT;
}
else if ( ret == -TOK_FUNC )
else if ( !strcasecmp(out[ival].key,"overlap") ) { out[ival].threshold = VCF_OVERLAP<<1; out[ival].is_str = 0; }
else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
+ out[ival].is_constant = 1;
if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
out[ival].tag = out[ival].key; out[ival].key = NULL;
i = itok;
else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( strcmp(".",out[ival].key) )
+ token_t *tok = &out[ival];
+ char *bp = tok->key;
+ tok->nvalues = 0;
+ int has_missing = 0;
+ while ( *bp )
{
- out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
- if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
- error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
+ char tmp, *ep = bp;
+ while ( *ep && *ep!=';' ) ep++;
+ tmp = *ep;
+ *ep = 0;
+ if ( !strcmp(".",bp) ) has_missing = 1;
+ else
+ {
+ tok->nvalues++;
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ int id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, bp);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,id) )
+ error("The filter \"%s\" not present in the VCF header\n", bp);
+ tok->values[tok->nvalues-1] = id;
+ }
+ *ep = tmp;
+ if ( !tmp ) break;
+ bp = ep + 1;
}
- else
- out[ival].hdr_id = -1;
- out[ival].tag = out[ival].key; out[ival].key = NULL;
- out[itok].hdr_id = out[ival].hdr_id;
+ if ( has_missing && tok->nvalues ) error("The FILTER expression cannot contain missing value AND filters: \"%s\" (%d)\n",tok->key,tok->nvalues);
+ out[ival].tag = tok->key;
+ tok->key = NULL;
+ out[itok].hdr_id = tok->hdr_id;
continue;
}
}
kputs(filter->filters[i].key, &filter->filters[i].str_value);
filter->filters[i].nvalues = filter->filters[i].str_value.l;
}
- else // numeric constant
+ else if ( filter->filters[i].is_constant ) // numeric constant
{
filter->filters[i].values[0] = filter->filters[i].threshold;
filter->filters[i].nvalues = 1;
if ( filter->filters[i].tok_type == TOK_ADD )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],+,(double));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_SUB )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],-,(double));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_MULT )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],*,(double));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
}
else if ( filter->filters[i].tok_type == TOK_DIV )
{
- VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/);
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],/,(double));
+ filter->flt_stack[nstack-2] = &filter->filters[i];
+ nstack--;
+ continue;
+ }
+ else if ( filter->filters[i].tok_type == TOK_MODULO )
+ {
+ VECTOR_ARITHMETICS(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1],&filter->filters[i],%,(int));
filter->flt_stack[nstack-2] = &filter->filters[i];
nstack--;
continue;
fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
exit(EXIT_FAILURE);
}
- if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 1)) {
fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
exit(EXIT_FAILURE);
}
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_FMT_NMBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
if ( conf->fmt_flag&B2B_INFO_SCB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
} else {
if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) )
conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR));
}
+ int nnmbz = (conf->fmt_flag&B2B_FMT_NMBZ) ? nsmpl + 1 : 1;
+ conf->bc.ref_nm = (int32_t*) malloc(sizeof(*conf->bc.ref_nm) * nnmbz * B2B_N_NM);
+ conf->bc.alt_nm = (int32_t*) malloc(sizeof(*conf->bc.alt_nm) * nnmbz * B2B_N_NM);
+ conf->bc.mwu_nm = (float*) malloc((nsmpl+1)*sizeof(*conf->bc.mwu_nm));
+ conf->bca->ref_nm = conf->bc.ref_nm; // this is just to make the arrays available in bcf_call_glfgen()
+ conf->bca->alt_nm = conf->bc.alt_nm;
+ if ( conf->fmt_flag&B2B_FMT_NMBZ )
+ {
+ for (i=0; i<nsmpl; i++) conf->bcr[i].ref_nm = conf->bc.ref_nm + (i+1)*B2B_N_NM;
+ for (i=0; i<nsmpl; i++) conf->bcr[i].alt_nm = conf->bc.alt_nm + (i+1)*B2B_N_NM;
+ }
// init mpileup
conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
free(conf->bc.ADF);
free(conf->bc.SCR);
free(conf->bc.QS);
+ free(conf->bc.ref_nm);
+ free(conf->bc.alt_nm);
free(conf->bc.fmt_arr);
+ free(conf->bc.mwu_nm);
free(conf->bcr);
}
if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
+ else if ( !strcasecmp(tags[i],"NMBZ") || !strcasecmp(tags[i],"FORMAT/NMBZ") || !strcasecmp(tags[i],"FMT/NMBZ") ) flag |= B2B_FMT_NMBZ;
else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
"\n"
"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
"\n"
-" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
-" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
-" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
-" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
-" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
-" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
-" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
+" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
+" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
+" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
"\n"
"INFO annotation tags available:\n"
"\n"
" --seed INT Random number seed used for sampling deep regions [0]\n"
"\n"
"Output options:\n"
- " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n"
+ " -a, --annotate LIST Optional tags to output; '\\?' to list available tags []\n"
" -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n"
" To minimum per-sample DP\n"
" --no-version Do not append version and command line to the header\n"
fprintf(bcftools_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
bcftools_exit(EXIT_FAILURE);
}
- if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 1)) {
fprintf(bcftools_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
bcftools_exit(EXIT_FAILURE);
}
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Base Quality Bias (closer to 0 is better)\">");
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Mapping Quality vs Strand Bias (closer to 0 is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
+ if ( conf->fmt_flag&B2B_FMT_NMBZ )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=NMBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Number of Mismatches within supporting reads (closer to 0 is better)\">");
if ( conf->fmt_flag&B2B_INFO_SCB )
bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SCBZ,Number=1,Type=Float,Description=\"Mann-Whitney U-z test of Soft-Clip Length Bias (closer to 0 is better)\">");
} else {
if ( conf->fmt_flag&(B2B_INFO_SCR|B2B_FMT_SCR) )
conf->bc.SCR = (int32_t*) malloc((nsmpl+1)*sizeof(*conf->bc.SCR));
}
+ int nnmbz = (conf->fmt_flag&B2B_FMT_NMBZ) ? nsmpl + 1 : 1;
+ conf->bc.ref_nm = (int32_t*) malloc(sizeof(*conf->bc.ref_nm) * nnmbz * B2B_N_NM);
+ conf->bc.alt_nm = (int32_t*) malloc(sizeof(*conf->bc.alt_nm) * nnmbz * B2B_N_NM);
+ conf->bc.mwu_nm = (float*) malloc((nsmpl+1)*sizeof(*conf->bc.mwu_nm));
+ conf->bca->ref_nm = conf->bc.ref_nm; // this is just to make the arrays available in bcf_call_glfgen()
+ conf->bca->alt_nm = conf->bc.alt_nm;
+ if ( conf->fmt_flag&B2B_FMT_NMBZ )
+ {
+ for (i=0; i<nsmpl; i++) conf->bcr[i].ref_nm = conf->bc.ref_nm + (i+1)*B2B_N_NM;
+ for (i=0; i<nsmpl; i++) conf->bcr[i].alt_nm = conf->bc.alt_nm + (i+1)*B2B_N_NM;
+ }
// init mpileup
conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
free(conf->bc.ADF);
free(conf->bc.SCR);
free(conf->bc.QS);
+ free(conf->bc.ref_nm);
+ free(conf->bc.alt_nm);
free(conf->bc.fmt_arr);
+ free(conf->bc.mwu_nm);
free(conf->bcr);
}
if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
else if ( !strcasecmp(tags[i],"SCR") || !strcasecmp(tags[i],"FORMAT/SCR") || !strcasecmp(tags[i],"FMT/SCR") ) flag |= B2B_FMT_SCR;
else if ( !strcasecmp(tags[i],"QS") || !strcasecmp(tags[i],"FORMAT/QS") || !strcasecmp(tags[i],"FMT/QS") ) flag |= B2B_FMT_QS;
+ else if ( !strcasecmp(tags[i],"NMBZ") || !strcasecmp(tags[i],"FORMAT/NMBZ") || !strcasecmp(tags[i],"FMT/NMBZ") ) flag |= B2B_FMT_NMBZ;
else if ( !strcasecmp(tags[i],"INFO/SCR") ) flag |= B2B_INFO_SCR;
else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
"\n"
"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
"\n"
-" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
-" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
-" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
-" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
-" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
-" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
-" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
+" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/NMBZ .. Mann-Whitney U-z test of Number of Mismatches within supporting reads (Number=1,Type=Float)\n"
+" FORMAT/QS .. Allele phred-score quality sum for use with `call -mG` and +trio-dnm (Number=R,Type=Integer)\n"
+" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+" FORMAT/SCR .. Number of soft-clipped reads (Number=1,Type=Integer)\n"
"\n"
"INFO annotation tags available:\n"
"\n"
" --seed INT Random number seed used for sampling deep regions [0]\n"
"\n"
"Output options:\n"
- " -a, --annotate LIST Optional tags to output; '?' to list available tags []\n"
+ " -a, --annotate LIST Optional tags to output; '\\?' to list available tags []\n"
" -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n"
" To minimum per-sample DP\n"
" --no-version Do not append version and command line to the header\n"
#include <errno.h>
#include <assert.h>
#include <limits.h>
-#include <zlib.h>
#include "prob1.h"
-// #include "kstring.h"
-// #include "kseq.h"
-// KSTREAM_INIT(gzFile, gzread, 16384)
-
#define MC_MAX_EM_ITER 16
#define MC_EM_EPS 1e-5
#define MC_DEF_INDEL 0.15
-gzFile bcf_p1_fp_lk;
-
void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
{
int i;
}
}
if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
- if (bcf_p1_fp_lk)
- gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1));
}
static void mc_cal_y(bcf_p1aux_t *ma)
#include <errno.h>
#include <assert.h>
#include <limits.h>
-#include <zlib.h>
#include "prob1.h"
-// #include "kstring.h"
-// #include "kseq.h"
-// KSTREAM_INIT(gzFile, gzread, 16384)
-
#define MC_MAX_EM_ITER 16
#define MC_EM_EPS 1e-5
#define MC_DEF_INDEL 0.15
-gzFile bcf_p1_fp_lk;
-
void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
{
int i;
}
}
if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
- if (bcf_p1_fp_lk)
- gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1));
}
static void mc_cal_y(bcf_p1aux_t *ma)
int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present
annot_col_t *cols; // column indexes and setters
int ncols;
- int match_id; // set iff `-c ~ID` given
- int match_end; // set iff `-c ~INFO/END` is given
+ int match_id; // column index (into alines[].cols) set iff `-c ~ID` is given, -1 otherwise
+ int match_end; // column index (into alines[].cols) set iff `-c ~INFO/END` is given, -1 otherwise
char *set_ids_fmt;
convert_t *set_ids;
}
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
- error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
+ error("The FORMAT tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
if ( ptr )
{
*ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
- error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
+ error("The INFO tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
}
- error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
+ error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
}
tmp.l = 0;
bcf_hrec_format_rename(hrec, key_dst, &tmp);
hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname);
+ error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
if ( args->tgts_is_vcf )
else if ( !strncasecmp("fmt/",ori_tag,4) ) type = BCF_HL_FMT, ori_tag += 4;
else if ( !strncasecmp("filter/",ori_tag,7) ) type = BCF_HL_FLT, ori_tag += 7;
else return -1;
+ if ( !strncasecmp("info/",new_tag,5) )
+ {
+ if ( type != BCF_HL_INFO ) error("Cannot transfer %s to INFO\n", ori_tag);
+ new_tag += 5;
+ }
+ else if ( !strncasecmp("format/",new_tag,7) )
+ {
+ if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag);
+ new_tag += 7;
+ }
+ else if ( !strncasecmp("fmt/",new_tag,4) )
+ {
+ if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag);
+ new_tag += 4;
+ }
+ else if ( !strncasecmp("filter/",new_tag,7) )
+ {
+ if ( type != BCF_HL_FLT ) error("Cannot transfer %s to FILTER\n", ori_tag);
+ new_tag += 7;
+ }
int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, ori_tag);
if ( id<0 ) return 1;
bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", ori_tag, NULL);
ialt++;
}
if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
- if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
+ if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
has_overlap = 1;
break;
args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
args->set_ids_replace = 1;
args->match_id = -1;
+ args->match_end = -1;
args->clevel = -1;
args->pair_logic = -1;
int regions_is_file = 0;
int ref_idx, alt_idx, chr_idx, beg_idx, end_idx; // -1 if not present
annot_col_t *cols; // column indexes and setters
int ncols;
- int match_id; // set iff `-c ~ID` given
- int match_end; // set iff `-c ~INFO/END` is given
+ int match_id; // column index (into alines[].cols) set iff `-c ~ID` is given, -1 otherwise
+ int match_end; // column index (into alines[].cols) set iff `-c ~INFO/END` is given, -1 otherwise
char *set_ids_fmt;
convert_t *set_ids;
}
int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
- error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
+ error("The FORMAT tag \"%s\" is not defined in %s, was the -h option provided?\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
memset(col,0,sizeof(*col));
if ( ptr )
{
*ptr = 0; tmp.l = 0; ksprintf(&tmp,"%s:=%s",key_src,ptr+1); *ptr = '=';
- error("The tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
+ error("The INFO tag \"%s\" is not defined, is this what you want \"%s\" ?\n",key_src,tmp.s);
}
- error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
+ error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_src,args->files->readers[1].fname);
}
tmp.l = 0;
bcf_hrec_format_rename(hrec, key_dst, &tmp);
hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname);
+ error("The INFO tag \"%s\" is not defined in %s, was the -h option provided?\n", key_dst, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
if ( args->tgts_is_vcf )
else if ( !strncasecmp("fmt/",ori_tag,4) ) type = BCF_HL_FMT, ori_tag += 4;
else if ( !strncasecmp("filter/",ori_tag,7) ) type = BCF_HL_FLT, ori_tag += 7;
else return -1;
+ if ( !strncasecmp("info/",new_tag,5) )
+ {
+ if ( type != BCF_HL_INFO ) error("Cannot transfer %s to INFO\n", ori_tag);
+ new_tag += 5;
+ }
+ else if ( !strncasecmp("format/",new_tag,7) )
+ {
+ if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag);
+ new_tag += 7;
+ }
+ else if ( !strncasecmp("fmt/",new_tag,4) )
+ {
+ if ( type != BCF_HL_FMT ) error("Cannot transfer %s to FORMAT\n", ori_tag);
+ new_tag += 4;
+ }
+ else if ( !strncasecmp("filter/",new_tag,7) )
+ {
+ if ( type != BCF_HL_FLT ) error("Cannot transfer %s to FILTER\n", ori_tag);
+ new_tag += 7;
+ }
int id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, ori_tag);
if ( id<0 ) return 1;
bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->hdr_out, type, "ID", ori_tag, NULL);
ialt++;
}
if ( args->match_id>=0 && !strstr_match(line->d.id,args->alines[i].cols[args->match_id]) ) continue;
- if ( match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
+ if ( args->match_end>=0 && match_end.l && strcmp(match_end.s,args->alines[i].cols[args->match_end]) ) continue;
args->srt_alines[args->nsrt_alines++] = (ialt<<16) | i;
has_overlap = 1;
break;
args->ref_idx = args->alt_idx = args->chr_idx = args->beg_idx = args->end_idx = -1;
args->set_ids_replace = 1;
args->match_id = -1;
+ args->match_end = -1;
args->clevel = -1;
args->pair_logic = -1;
int regions_is_file = 0;
/* The MIT License
- Copyright (c) 2016-2021 Genome Research Ltd.
+ Copyright (c) 2016-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
struct _vcfbuf_t
{
- int win;
+ int win, dummy;
bcf_hdr_t *hdr;
vcfrec_t *vcf;
rbuf_t rbuf;
if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
+ if ( key==VCFBUF_DUMMY ) { buf->dummy = *((int*)value); return; }
if ( key==VCFBUF_NSITES )
{
buf->prune.max_sites = *((int*)value);
else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
else error("The mode \"%s\" is not recognised\n",mode);
+ return;
}
}
int i = rbuf_append(&buf->rbuf);
if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1();
-
+
bcf1_t *ret = buf->vcf[i].rec;
buf->vcf[i].rec = rec;
buf->vcf[i].af_set = 0;
for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
{
bcf1_t *line = buf->vcf[i].rec;
- if ( line->n_allele > buf->prune.mac )
+ if ( line->n_allele > buf->prune.mac )
{
buf->prune.ac = (int*) realloc(buf->prune.ac, line->n_allele*sizeof(*buf->prune.ac));
buf->prune.mac = line->n_allele;
}
else if ( bcf_calc_ac(buf->hdr, line, buf->prune.ac, BCF_UN_INFO|BCF_UN_FMT) )
{
- int ntot = buf->prune.ac[0], nalt = 0;
+ int ntot = buf->prune.ac[0], nalt = 0;
for (k=1; k<line->n_allele; k++) nalt += buf->prune.ac[k];
buf->vcf[i].af = ntot ? (float)nalt/ntot : 0;
}
{
buf->overlap.rid = last->rec->rid;
buf->overlap.end = end_pos;
- return 0;
+ return 0;
}
if ( beg_pos <= buf->overlap.end )
{
int i,j;
if ( buf->rbuf.n==0 ) return NULL;
- if ( flush_all ) goto ret;
+ if ( flush_all || buf->dummy ) goto ret;
i = rbuf_kth(&buf->rbuf, 0); // first
j = rbuf_last(&buf->rbuf); // last
else if ( buf->win < 0 )
{
if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL;
+ goto ret;
}
- else return NULL;
-
+ else
+ return NULL;
+
ret:
if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage
- disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265
+ disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265
\hat{D} = 1/[n*(n+1)]*[
(n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9)
double nhd[] = {0,0,0,0,0,0,0,0,0};
double ab = 0, aa = 0, bb = 0, a = 0, b = 0;
int nab = 0, ndiff = 0;
- int an_tot = 0, bn_tot = 0;
+ int an_tot = 0, bn_tot = 0;
for (i=0; i<arec->n_sample; i++)
{
int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output
ld->val[VCFBUF_LD_IDX_HD] =
- (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8])
+ (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8])
- (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.);
ld->val[VCFBUF_LD_IDX_HD] /= nab;
ld->val[VCFBUF_LD_IDX_HD] /= nab+1;
}
for (i=-1; rbuf_next(&buf->rbuf,&i); )
- {
+ {
if ( buf->vcf[i].filter ) continue;
if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes
/* The MIT License
- Copyright (c) 2016-2021 Genome Research Ltd.
+ Copyright (c) 2016-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
struct _vcfbuf_t
{
- int win;
+ int win, dummy;
bcf_hdr_t *hdr;
vcfrec_t *vcf;
rbuf_t rbuf;
if ( key==LD_MAX_LD ) { buf->ld.max[VCFBUF_LD_IDX_LD] = *((double*)value); return; }
if ( key==LD_MAX_HD ) { buf->ld.max[VCFBUF_LD_IDX_HD] = *((double*)value); return; }
+ if ( key==VCFBUF_DUMMY ) { buf->dummy = *((int*)value); return; }
if ( key==VCFBUF_NSITES )
{
buf->prune.max_sites = *((int*)value);
else if ( !strcasecmp(mode,"1st") ) buf->prune.mode = PRUNE_MODE_1ST;
else if ( !strcasecmp(mode,"rand") ) buf->prune.mode = PRUNE_MODE_RAND;
else error("The mode \"%s\" is not recognised\n",mode);
+ return;
}
}
int i = rbuf_append(&buf->rbuf);
if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1();
-
+
bcf1_t *ret = buf->vcf[i].rec;
buf->vcf[i].rec = rec;
buf->vcf[i].af_set = 0;
for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
{
bcf1_t *line = buf->vcf[i].rec;
- if ( line->n_allele > buf->prune.mac )
+ if ( line->n_allele > buf->prune.mac )
{
buf->prune.ac = (int*) realloc(buf->prune.ac, line->n_allele*sizeof(*buf->prune.ac));
buf->prune.mac = line->n_allele;
}
else if ( bcf_calc_ac(buf->hdr, line, buf->prune.ac, BCF_UN_INFO|BCF_UN_FMT) )
{
- int ntot = buf->prune.ac[0], nalt = 0;
+ int ntot = buf->prune.ac[0], nalt = 0;
for (k=1; k<line->n_allele; k++) nalt += buf->prune.ac[k];
buf->vcf[i].af = ntot ? (float)nalt/ntot : 0;
}
{
buf->overlap.rid = last->rec->rid;
buf->overlap.end = end_pos;
- return 0;
+ return 0;
}
if ( beg_pos <= buf->overlap.end )
{
int i,j;
if ( buf->rbuf.n==0 ) return NULL;
- if ( flush_all ) goto ret;
+ if ( flush_all || buf->dummy ) goto ret;
i = rbuf_kth(&buf->rbuf, 0); // first
j = rbuf_last(&buf->rbuf); // last
else if ( buf->win < 0 )
{
if ( buf->vcf[i].rec->pos - buf->vcf[j].rec->pos > buf->win ) return NULL;
+ goto ret;
}
- else return NULL;
-
+ else
+ return NULL;
+
ret:
if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);
D =~ (GT correlation) * sqrt(Pa*(1-Pa)*Pb*(1-Pb))
and `hd` as proposed in Ragsdale, A. P., & Gravel, S. (2019). Unbiased estimation of linkage
- disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265
+ disequilibrium from unphased data. Molecular Biology and Evolution. doi:10.1093/molbev/msz265
\hat{D} = 1/[n*(n+1)]*[
(n1 + n2/2 + n4/2 + n5/4)*(n5/4 + n6/2 + n8/2 + n9)
double nhd[] = {0,0,0,0,0,0,0,0,0};
double ab = 0, aa = 0, bb = 0, a = 0, b = 0;
int nab = 0, ndiff = 0;
- int an_tot = 0, bn_tot = 0;
+ int an_tot = 0, bn_tot = 0;
for (i=0; i<arec->n_sample; i++)
{
int8_t *aptr = (int8_t*) (afmt->p + i*afmt->size);
ld->val[VCFBUF_LD_IDX_LD] = fabs(ld->val[VCFBUF_LD_IDX_LD]); // avoid "-0" on output
ld->val[VCFBUF_LD_IDX_HD] =
- (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8])
+ (nhd[0] + nhd[1]/2. + nhd[3]/2. + nhd[4]/4.)*(nhd[4]/4. + nhd[5]/2. + nhd[7]/2. + nhd[8])
- (nhd[1]/2. + nhd[2] + nhd[4]/4. + nhd[5]/2.)*(nhd[3]/2. + nhd[4]/4. + nhd[6] + nhd[7]/2.);
ld->val[VCFBUF_LD_IDX_HD] /= nab;
ld->val[VCFBUF_LD_IDX_HD] /= nab+1;
}
for (i=-1; rbuf_next(&buf->rbuf,&i); )
- {
+ {
if ( buf->vcf[i].filter ) continue;
if ( _calc_r2_ld(buf, buf->vcf[i].rec, rec, &tmp) < 0 ) continue; // missing genotypes
/* The MIT License
- Copyright (c) 2017-2021 Genome Research Ltd.
+ Copyright (c) 2017-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
-
+
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// Modes of operation
typedef enum
{
+ VCFBUF_DUMMY, // the caller maintains the buffer via push/peek/flush, nothing is removed by vcfbuf
+
VCFBUF_OVERLAP_WIN, // keep only overlapping variants in the window
VCFBUF_RMDUP, // remove duplicate sites (completely)
VCFBUF_NSITES, // leave at max this many sites in the window
LD_FILTER1, // exclude the next record inserted by vcfbuf_push() from LD analysis
LD_MAX_R2, // If set, vcfbuf_ld() will stop at the first record that exceeds the R2,
LD_MAX_LD, // LD, or HD threshold. When multiple are set, the OR logic is applied
- LD_MAX_HD, //
+ LD_MAX_HD, //
}
vcfbuf_opt_t;
/*
* vcfbuf_init() - init buffer
- * @win: number of sites (>0) or bp (<0)
+ * @win: window size, given as number of sites (>0) or bp (<0)
*/
vcfbuf_t *vcfbuf_init(bcf_hdr_t *hdr, int win);
void vcfbuf_destroy(vcfbuf_t *buf);
*/
bcf1_t *vcfbuf_remove(vcfbuf_t *buf, int idx);
+/*
+ * vcfbuf_flush() - returns the next record or NULL, depending on the mode of operation and
+ * the content of the buffer
+ */
bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all);
/*
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <math.h>
#include <htslib/vcf.h>
#include <time.h>
-#include <zlib.h>
#include <stdarg.h>
#include <htslib/kfunc.h>
#include <htslib/synced_bcf_reader.h>
}
tmp++;
}
- if ( j!=5 ) break;
+ if ( j<4 ) break;
+
+ char sex;
+ if ( col_ends[3][1]=='1' ) sex = 'M';
+ else if ( col_ends[3][1]=='2' ) sex = 'F';
+ else break;
- char sex = col_ends[3][1]=='1' ? 'M' : 'F';
lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother
{
free(fam_str.s);
khash_str2int_destroy_free(name2idx);
- if ( i!=nvals ) // not a ped file
- {
- if ( i>0 ) error("Could not parse samples, not a PED format.\n");
- return NULL;
- }
+ if ( i!=nvals ) return NULL; // not a ped file
+
*nsmpl = nlines;
return lines;
}
lines = smpls;
nlines = nsmpls;
}
+ else if ( is_file )
+ fprintf(stderr,"Note: could not parse as PED: %s\n",fn);
args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
return NULL;
}
- // Find the VCF and tab record with the best matching combination of alleles, prioritize
+ // Find the VCF and tab record with the best matching combination of alleles, prioritize
// records of the same type (snp vs indel)
rec_tgt_t rec_tgt;
memset(&rec_tgt,0,sizeof(rec_tgt));
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
- for (i=0; i<args->nsamples; i++)
+ for (i=0; i<args->nsamples; i++)
if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
}
fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
- fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
+ fprintf(stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
fprintf(stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
fprintf(stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
*args.aux.prior_AC = 0;
args.aux.prior_AC++;
break;
- case 'g':
+ case 'g':
args.gvcf = gvcf_init(optarg);
if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
break;
/* vcfcall.c -- SNP/indel variant calling from VCF/BCF.
- Copyright (C) 2013-2021 Genome Research Ltd.
+ Copyright (C) 2013-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <math.h>
#include <htslib/vcf.h>
#include <time.h>
-#include <zlib.h>
#include <stdarg.h>
#include <htslib/kfunc.h>
#include <htslib/synced_bcf_reader.h>
}
tmp++;
}
- if ( j!=5 ) break;
+ if ( j<4 ) break;
+
+ char sex;
+ if ( col_ends[3][1]=='1' ) sex = 'M';
+ else if ( col_ends[3][1]=='2' ) sex = 'F';
+ else break;
- char sex = col_ends[3][1]=='1' ? 'M' : 'F';
lines = add_sample(name2idx, lines, &nlines, &mlines, col_ends[0]+1, sex, &j);
if ( strcmp(col_ends[1]+1,"0") && strcmp(col_ends[2]+1,"0") ) // father and mother
{
free(fam_str.s);
khash_str2int_destroy_free(name2idx);
- if ( i!=nvals ) // not a ped file
- {
- if ( i>0 ) error("Could not parse samples, not a PED format.\n");
- return NULL;
- }
+ if ( i!=nvals ) return NULL; // not a ped file
+
*nsmpl = nlines;
return lines;
}
lines = smpls;
nlines = nsmpls;
}
+ else if ( is_file )
+ fprintf(bcftools_stderr,"Note: could not parse as PED: %s\n",fn);
args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
return NULL;
}
- // Find the VCF and tab record with the best matching combination of alleles, prioritize
+ // Find the VCF and tab record with the best matching combination of alleles, prioritize
// records of the same type (snp vs indel)
rec_tgt_t rec_tgt;
memset(&rec_tgt,0,sizeof(rec_tgt));
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
- for (i=0; i<args->nsamples; i++)
+ for (i=0; i<args->nsamples; i++)
if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
}
fprintf(bcftools_stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
fprintf(bcftools_stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
fprintf(bcftools_stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
- fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
+ fprintf(bcftools_stderr, " This requires FORMAT/QS or other Number=R,Type=Integer tag such as FORMAT/AD\n");
fprintf(bcftools_stderr, " --group-samples-tag TAG The tag to use with -G, by default FORMAT/QS and FORMAT/AD are checked automatically\n");
fprintf(bcftools_stderr, " -g, --gvcf INT,[...] Group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(bcftools_stderr, " -i, --insert-missed Output also sites missed by mpileup but present in -T\n");
*args.aux.prior_AC = 0;
args.aux.prior_AC++;
break;
- case 'g':
+ case 'g':
args.gvcf = gvcf_init(optarg);
if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
break;
if ( args->gen_3N6 )
{
tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP");
- tsv_register(tsv, "CHROM", tsv_setter_chrom, args);
+ tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header);
}
else
tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP");
if ( args->gen_3N6 )
{
tsv = tsv_init("CHROM,CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP");
- tsv_register(tsv, "CHROM", tsv_setter_chrom, args);
+ tsv_register(tsv, "CHROM", tsv_setter_chrom, args->header);
}
else
tsv = tsv_init("CHROM_POS_REF_ALT,ID,POS,REF_ALT,GT_GP");
enum {
per_contig = 1,
- total = 2
+ all_contigs = 2,
+ total = 4
};
static void usage(void)
fprintf(stderr, " --threads INT use multithreading with INT worker threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Stats options:\n");
+ fprintf(stderr, " -a, --all with --stats, print stats for all contigs even when zero\n");
fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n");
fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n");
fprintf(stderr, "\n");
for (tid=0; tid<nseq; tid++)
{
uint64_t records, v;
- hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
+ int ret = hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
sum += records;
- if ( (stats&total) || !records ) continue;
+ if ( (stats&total) || (records == 0 && !(stats&all_contigs)) ) continue;
const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a";
bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- printf("%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
+ printf("%s\t%s\t", ctg_name, hkey<0?".":hrec->vals[hkey]);
+ if (ret >= 0) printf("%" PRIu64 "\n", records);
+ else printf(".\n");
}
if ( !sum )
{
static struct option loptions[] =
{
+ {"all",no_argument,NULL,'a'},
{"csi",no_argument,NULL,'c'},
{"tbi",no_argument,NULL,'t'},
{"force",no_argument,NULL,'f'},
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0)
{
switch (c)
{
break;
case 's': stats |= per_contig; break;
case 'n': stats |= total; break;
+ case 'a': stats |= all_contigs; break;
case 9:
n_threads = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
enum {
per_contig = 1,
- total = 2
+ all_contigs = 2,
+ total = 4
};
static void usage(void)
fprintf(bcftools_stderr, " --threads INT use multithreading with INT worker threads [0]\n");
fprintf(bcftools_stderr, "\n");
fprintf(bcftools_stderr, "Stats options:\n");
+ fprintf(bcftools_stderr, " -a, --all with --stats, print stats for all contigs even when zero\n");
fprintf(bcftools_stderr, " -n, --nrecords print number of records based on existing index file\n");
fprintf(bcftools_stderr, " -s, --stats print per contig stats based on existing index file\n");
fprintf(bcftools_stderr, "\n");
for (tid=0; tid<nseq; tid++)
{
uint64_t records, v;
- hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
+ int ret = hts_idx_get_stat(tbx ? tbx->idx : idx, tid, &records, &v);
sum += records;
- if ( (stats&total) || !records ) continue;
+ if ( (stats&total) || (records == 0 && !(stats&all_contigs)) ) continue;
const char *ctg_name = tbx ? seq[tid] : hdr ? bcf_hdr_id2name(hdr, tid) : "n/a";
bcf_hrec_t *hrec = hdr ? bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", ctg_name, NULL) : NULL;
int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- fprintf(bcftools_stdout, "%s\t%s\t%" PRIu64 "\n", ctg_name, hkey<0?".":hrec->vals[hkey], records);
+ fprintf(bcftools_stdout, "%s\t%s\t", ctg_name, hkey<0?".":hrec->vals[hkey]);
+ if (ret >= 0) fprintf(bcftools_stdout, "%" PRIu64 "\n", records);
+ else fprintf(bcftools_stdout, ".\n");
}
if ( !sum )
{
static struct option loptions[] =
{
+ {"all",no_argument,NULL,'a'},
{"csi",no_argument,NULL,'c'},
{"tbi",no_argument,NULL,'t'},
{"force",no_argument,NULL,'f'},
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:snao:", loptions, NULL)) >= 0)
{
switch (c)
{
break;
case 's': stats |= per_contig; break;
case 'n': stats |= total; break;
+ case 'a': stats |= all_contigs; break;
case 9:
n_threads = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
{
if ( !args->write ) args->write = (int*) calloc(args->files->nreaders,sizeof(int));
if ( sscanf(p,"%d",&i)!=1 ) error("Could not parse --write %s\n", args->write_files);
- if ( i<0 || i>args->files->nreaders ) error("The index is out of range: %d (%s)\n", i, args->write_files);
+ if ( i<=0 || i>args->files->nreaders ) error("The index is out of range: %d (-w %s)\n", i, args->write_files);
args->write[i-1] = 1;
args->iwrite = i-1;
args->nwrite++;
args->isec_op = OP_VENN;
if ( !args->prefix ) error("Expected the -p option\n");
}
- if ( !args->targets_list )
+ if ( !args->isec_op )
{
- if ( argc-optind<2 ) error("Expected multiple files or the --targets option\n");
- if ( !args->isec_op ) error("One of the options --complement, --nfiles or --targets must be given with more than two files\n");
+ args->isec_op = OP_PLUS;
+ args->isec_n = 1;
}
args->files->require_index = 1;
while (optind<argc)
{
if ( !args->write ) args->write = (int*) calloc(args->files->nreaders,sizeof(int));
if ( sscanf(p,"%d",&i)!=1 ) error("Could not parse --write %s\n", args->write_files);
- if ( i<0 || i>args->files->nreaders ) error("The index is out of range: %d (%s)\n", i, args->write_files);
+ if ( i<=0 || i>args->files->nreaders ) error("The index is out of range: %d (-w %s)\n", i, args->write_files);
args->write[i-1] = 1;
args->iwrite = i-1;
args->nwrite++;
args->isec_op = OP_VENN;
if ( !args->prefix ) error("Expected the -p option\n");
}
- if ( !args->targets_list )
+ if ( !args->isec_op )
{
- if ( argc-optind<2 ) error("Expected multiple files or the --targets option\n");
- if ( !args->isec_op ) error("One of the options --complement, --nfiles or --targets must be given with more than two files\n");
+ args->isec_op = OP_PLUS;
+ args->isec_n = 1;
}
args->files->require_index = 1;
while (optind<argc)
#define DBG 0
+#define COLLAPSE_SNP_INS_DEL (1<<10)
+
#include <htslib/khash.h>
KHASH_MAP_INIT_STR(strdict, int)
typedef khash_t(strdict) strdict_t;
info_rule_t *rule = &args->rules[n];
rule->hdr_tag = strdup(ss);
int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
- if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
+ if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The INFO tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
}
return 0;
}
-static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2;
+
+// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
+// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
+// to accommodate for VCF_GVCF_REF defined below
+static const int
+ snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
+ indel_mask = VCF_INDEL<<2,
+ ins_mask = VCF_INS<<2,
+ del_mask = VCF_DEL<<2,
+ ref_mask = 2;
/*
Check incoming lines for new gVCF blocks, set pointer to the current source
id = line->d.id;
else
{
- int var_type = bcf_get_variant_types(line);
+ int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
+ if (var_type < 0) error("bcf_has_variant_types() failed.");
+ if ( args->collapse==COLLAPSE_SNP_INS_DEL )
+ {
+ // need to distinguish between ins and del so strip the VCF_INDEL flag
+ var_type &= ~VCF_INDEL;
+ }
maux->var_types |= var_type ? var_type<<2 : 2;
// for the `-m none -g` mode
bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
- int line_type = bcf_get_variant_types(line);
+ int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
+ if (line_type < 0) error("bcf_has_variant_types() failed.");
line_type = line_type ? line_type<<2 : 2;
// select relevant lines
// - SNPs+SNPs+MNPs+REF if -m both,snps
// - indels+indels+REF if -m both,indels, REF only if SNPs are not present
// - SNPs come first
- if ( line_type & indel_mask )
+ if ( line_type & (indel_mask|ins_mask|del_mask) )
{
if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
{
if ( buf->rec[j].skip ) continue; // done or not compatible
if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
- int line_type = bcf_get_variant_types(buf->lines[j]);
+ int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap);
+ if (line_type < 0) error("bcf_has_variant_types() failed.");
if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
if ( line_type==VCF_REF )
{
if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( maux->var_types&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
if ( maux->var_types&ref_mask ) break;
}
else if ( maux->var_types&ref_mask )
{
if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
}
}
}
fprintf(stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(stderr, " -l, --file-list FILE Read file names from the file\n");
fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
- fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
else error("The -m type \"%s\" is not recognised.\n", optarg);
break;
#define DBG 0
+#define COLLAPSE_SNP_INS_DEL (1<<10)
+
#include <htslib/khash.h>
KHASH_MAP_INIT_STR(strdict, int)
typedef khash_t(strdict) strdict_t;
info_rule_t *rule = &args->rules[n];
rule->hdr_tag = strdup(ss);
int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
- if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
+ if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The INFO tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
}
return 0;
}
-static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2;
+
+// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
+// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
+// to accommodate for VCF_GVCF_REF defined below
+static const int
+ snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
+ indel_mask = VCF_INDEL<<2,
+ ins_mask = VCF_INS<<2,
+ del_mask = VCF_DEL<<2,
+ ref_mask = 2;
/*
Check incoming lines for new gVCF blocks, set pointer to the current source
id = line->d.id;
else
{
- int var_type = bcf_get_variant_types(line);
+ int var_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
+ if (var_type < 0) error("bcf_has_variant_types() failed.");
+ if ( args->collapse==COLLAPSE_SNP_INS_DEL )
+ {
+ // need to distinguish between ins and del so strip the VCF_INDEL flag
+ var_type &= ~VCF_INDEL;
+ }
maux->var_types |= var_type ? var_type<<2 : 2;
// for the `-m none -g` mode
bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
- int line_type = bcf_get_variant_types(line);
+ int line_type = bcf_has_variant_types(line, VCF_ANY, bcf_match_overlap);
+ if (line_type < 0) error("bcf_has_variant_types() failed.");
line_type = line_type ? line_type<<2 : 2;
// select relevant lines
// - SNPs+SNPs+MNPs+REF if -m both,snps
// - indels+indels+REF if -m both,indels, REF only if SNPs are not present
// - SNPs come first
- if ( line_type & indel_mask )
+ if ( line_type & (indel_mask|ins_mask|del_mask) )
{
if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
{
if ( buf->rec[j].skip ) continue; // done or not compatible
if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
- int line_type = bcf_get_variant_types(buf->lines[j]);
+ int line_type = bcf_has_variant_types(buf->lines[j], VCF_ANY, bcf_match_overlap);
+ if (line_type < 0) error("bcf_has_variant_types() failed.");
if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
if ( line_type==VCF_REF )
{
if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( maux->var_types&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
if ( maux->var_types&ref_mask ) break;
}
else if ( maux->var_types&ref_mask )
{
if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
+ if ( line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
}
}
}
fprintf(bcftools_stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(bcftools_stderr, " -l, --file-list FILE Read file names from the file\n");
fprintf(bcftools_stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
- fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
+ fprintf(bcftools_stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
fprintf(bcftools_stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
fprintf(bcftools_stderr, " --no-version Do not append version and command line to the header\n");
fprintf(bcftools_stderr, " -o, --output FILE Write output to a file [standard output]\n");
else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
+ else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
else error("The -m type \"%s\" is not recognised.\n", optarg);
break;
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
for (i=1; i<line->n_allele; i++)
{
if ( args->first_allele_only && i>1 ) break;
- if ( bcf_get_variant_type(line,i)!=VCF_INDEL ) continue;
- int len = line->d.var[i].n;
+ int is_indel = bcf_has_variant_type(line,i,VCF_INDEL);
+ if (is_indel < 0) error("bcf_has_variant_type() failed.");
+ if ( !is_indel ) continue;
+ int len = bcf_variant_length(line, i);
#if IRC_STATS
// Indel repeat consistency
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2021 Genome Research Ltd.
+ Copyright (C) 2012-2022 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
for (i=1; i<line->n_allele; i++)
{
if ( args->first_allele_only && i>1 ) break;
- if ( bcf_get_variant_type(line,i)!=VCF_INDEL ) continue;
- int len = line->d.var[i].n;
+ int is_indel = bcf_has_variant_type(line,i,VCF_INDEL);
+ if (is_indel < 0) error("bcf_has_variant_type() failed.");
+ if ( !is_indel ) continue;
+ int len = bcf_variant_length(line, i);
#if IRC_STATS
// Indel repeat consistency
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.15.1
+VERSION=1.16
# If we have a git clone, then check against the current tag
if [ -e .git ]
locate("version.sh", srcdir, exclude_htslib=True))
if dest == "htslib":
- # Add build files, including *.ac *.in *.mk *.m4
+ # Add build files, including *.ac *.in *.mk *.m4 *.sh
mfiles = itertools.chain(mfiles, locate("Makefile", srcdir),
locate("configure", srcdir),
- locate("*.[aim][cnk4]", srcdir))
+ locate("*.[aims][cnk4h]", srcdir, exclude))
ncopied = 0
# All configuration values have a default; values that are commented out
# serve to show the default.
-import sys, os, sysconfig
+import sys, os, setuptools
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-_pyversion = sysconfig.get_python_version()
-_libdir = "../build/lib.%s-%s" % (sysconfig.get_platform(), _pyversion)
+_build_obj = setuptools.dist.Distribution().get_command_obj('build')
+_build_obj.ensure_finalized()
+
+_libdir = os.path.join('..', _build_obj.build_platlib)
if os.path.exists(_libdir):
sys.path.insert(0, os.path.abspath(_libdir))
'sphinx.ext.intersphinx',
'sphinx.ext.napoleon']
-intersphinx_mapping = {'python': ('https://docs.python.org/%s' % _pyversion, None)}
+intersphinx_mapping = {'python': ('https://docs.python.org/%d.%d' % sys.version_info[:2], None)}
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
using cython and a high-level, pythonic API for convenient access to
the data within genomic file formats.
-The current version wraps *htslib-1.15.1*, *samtools-1.15.1*, and *bcftools-1.15.1*.
+The current version wraps *htslib-1.16*, *samtools-1.16.1*, and *bcftools-1.16*.
To install the latest release, type::
) -> None: ...
def has_tag(self, tag: str) -> bool: ...
@overload
- def get_tag(self, tag: str, with_value_type: Literal[False]) -> TagValue: ...
+ def get_tag(self, tag: str, with_value_type: Literal[False] = ...) -> TagValue: ...
@overload
- def get_tag(self, tag, with_value_type: Literal[True]) -> Tuple[TagValue, str]: ...
+ def get_tag(
+ self, tag: str, with_value_type: Literal[True]
+ ) -> Tuple[TagValue, str]: ...
@overload
def get_tag(
- self, tag, with_value_type: bool = ...
+ self, tag: str, with_value_type: bool
) -> Union[TagValue, Tuple[TagValue, str]]: ...
@overload
def get_tags(
- self, with_value_type: Literal[False]
+ self, with_value_type: Literal[False] = ...
) -> List[Tuple[str, TagValue]]: ...
@overload
def get_tags(
self, with_value_type: Literal[True]
) -> List[Tuple[str, TagValue, str]]: ...
@overload
+ def get_tags(
+ self, with_value_type: bool
+ ) -> Union[List[Tuple[str, TagValue]], List[Tuple[str, TagValue, str]]]: ...
+ @overload
def get_tags(
self, with_value_type: bool = ...
) -> Union[List[Tuple[str, TagValue, str]], List[Tuple[str, TagValue]]]: ...
class IteratorRow:
def __iter__(self) -> IteratorRow: ...
- def __next__(self) -> AlignedSegment: ...
+ def __next__(self) -> PileupColumn: ...
class IteratorRowAll(IteratorRow): ...
class IteratorRowAllRefs(IteratorRow): ...
class IteratorColumn:
def __iter__(self) -> IteratorRow: ...
- def __next__(self) -> AlignedSegment: ...
+ def __next__(self) -> PileupColumn: ...
@property
def seq_len(self) -> int: ...
def add_reference(self, fastafile: FastaFile) -> None: ...
def add(
self,
id: str,
- number: Optional[str],
+ number: Optional[Union[int, str]],
type: Optional[str],
description: str,
**kwargs
contig: Optional[str] = ...,
start: int = ...,
stop: int = ...,
- alleles: Optional[Tuple[str]] = ...,
+ alleles: Optional[Tuple[str, ...]] = ...,
id: Optional[str] = ...,
qual: Optional[int] = ...,
filter: Optional[Any] = ...,
info: Optional[Mapping[str, _InfoValue]] = ...,
- samples: Optional[Iterable[str]] = ...,
+ samples: Optional[Iterable[Optional[Mapping[str, _FormatValue]]]] = ...,
**kwargs
) -> VariantRecord: ...
def add_record(self, record: VariantHeaderRecord) -> None: ...
qual: Optional[int]
id: Optional[str]
ref: Optional[str]
- alleles: Optional[Tuple[str]]
- alts: Optional[Tuple[str]]
+ alleles: Optional[Tuple[str, ...]]
+ alts: Optional[Tuple[str, ...]]
@property
def filter(self) -> VariantRecordFilter: ...
@property
def index(self) -> int: ...
@property
def name(self) -> str: ...
- allele_indices: Optional[Tuple[Optional[int]]]
- alleles: Optional[Tuple[Optional[str]]]
+ allele_indices: Optional[Tuple[Optional[int, ...]]]
+ alleles: Optional[Tuple[Optional[str, ...]]]
phased: bool
def __setitem__(self, key: str, value: _FormatValue) -> None: ...
def __delitem__(self, key: str) -> None: ...
return bcf_format_get_alleles(self)
@alleles.setter
- def alleles(self, value):
- self['GT'] = value
+ def alleles(self, value: tuple):
+ # Sets the genotype, supply a tuple of alleles to set.
+ # The supplied alleles need to be defined in the correspoding pysam.libcbcf.VariantRecord
+ # The genotype is reset when an empty tuple, None or (None,) is supplied
+
+ if value==(None,) or value==tuple() or value is None:
+ self['GT'] = ()
+ return
+
+ if any((type(x) == int for x in value)):
+ raise ValueError('Use .allele_indices to set integer allele indices')
+
+ # determine and set allele indices:
+ try:
+ self['GT'] = tuple( (self.record.alleles.index(allele) for allele in value) )
+ except ValueError:
+ raise ValueError("One or more of the supplied sample alleles are not defined as alleles of the corresponding pysam.libcbcf.VariantRecord."
+ "First set the .alleles of this record to define the alleles")
@alleles.deleter
def alleles(self):
"merge": ("merge", None),
"markdup": ("markdup", None),
"rmdup": ("rmdup", None),
+ "reference": ("reference", None),
"reheader": ("reheader", None),
"cat": ("cat", None),
"targetcut": ("targetcut", None),
// Version information used while compiling samtools, bcftools, and htslib
-#define SAMTOOLS_VERSION "1.15.1 (pysam)"
-#define BCFTOOLS_VERSION "1.15.1 (pysam)"
-#define HTS_VERSION_TEXT "1.15.1 (pysam)"
+#define SAMTOOLS_VERSION "1.16.1 (pysam)"
+#define BCFTOOLS_VERSION "1.16 (pysam)"
+#define HTS_VERSION_TEXT "1.16 (pysam)"
# pysam versioning information
-__version__ = "0.19.1"
+__version__ = "0.20.0"
-__samtools_version__ = "1.15.1"
-__bcftools_version__ = "1.15.1"
-__htslib_version__ = "1.15.1"
+__samtools_version__ = "1.16.1"
+__bcftools_version__ = "1.16"
+__htslib_version__ = "1.16"
The typical simple case of building Samtools using the HTSlib bundled within
this Samtools release tarball is done as follows:
- cd .../samtools-1.15.1 # Within the unpacked release directory
+ cd .../samtools-1.16.1 # Within the unpacked release directory
./configure
make
installation using the HTSlib bundled within this Samtools release tarball,
and building the various HTSlib utilities such as bgzip is done as follows:
- cd .../samtools-1.15.1 # Within the unpacked release directory
+ cd .../samtools-1.16.1 # Within the unpacked release directory
./configure --prefix=/path/to/location
make all all-htslib
make install install-htslib
To build with plug-ins, you need to use the --enable-plugins configure option
as follows:
- cd .../samtools-1.15.1 # Within the unpacked release directory
+ cd .../samtools-1.16.1 # Within the unpacked release directory
./configure --enable-plugins --prefix=/path/to/location
make all all-htslib
make install install-htslib
the source distribution instead of installing the package. In that case
you can use:
- cd .../samtools-1.15.1 # Within the unpacked release directory
- ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.15.1
+ cd .../samtools-1.16.1 # Within the unpacked release directory
+ ./configure --enable-plugins --with-plugin-path=$PWD/htslib-1.16
make all all-htslib
It is possible to override the built-in search path using the HTS_PATH
/* bam.c -- miscellaneous BAM functions.
- Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
+ Copyright (C) 2008-2013, 2015, 2019-2020, 2022 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
b->core.flag |= BAM_FUNMAP;
return -1;
}
+
+/* Work out the read's start from its stored cigar: core.pos minus the total
+   length of the leading soft/hard clip operations, plus one. */
+hts_pos_t unclipped_start(bam1_t *b) {
+    uint32_t *ops = bam_get_cigar(b);
+    int64_t lead_clip = 0;
+    uint32_t k = 0;
+
+    // Sum the leading run of S/H operations; the first other op ends the scan.
+    while (k < b->core.n_cigar) {
+        char op = bam_cigar_opchr(ops[k]);
+
+        if (op != 'S' && op != 'H')
+            break;
+
+        lead_clip += bam_cigar_oplen(ops[k]);
+        k++;
+    }
+
+    return b->core.pos - lead_clip + 1;
+}
+
+/* Calculate the mate's unclipped start based on position and cigar string from MC tag.
+   op is the mate position and cigar the textual cigar.  The lengths of the
+   leading S/H operations are summed (an operation with no numeric prefix
+   counts as 1); scanning stops at the first other character, at '*' or at
+   the end of the string.  Returns op minus that sum, plus one. */
+hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) {
+    char *c = cigar;
+    int64_t clipped = 0;
+
+    while (*c && *c != '*') {
+        long num = 1;
+
+        // <ctype.h> functions need a value representable as unsigned char
+        if (isdigit((unsigned char)*c)) {
+            num = strtol(c, &c, 10);
+        }
+
+        if (*c == 'S' || *c == 'H') { // clips
+            clipped += num;
+        } else {
+            break;
+        }
+
+        c++;
+    }
+
+    return op - clipped + 1;
+}
+
+/* Work out the read's end from its stored cigar: bam_endpos() plus the total
+   length of the trailing soft/hard clip operations. */
+hts_pos_t unclipped_end(bam1_t *b) {
+    uint32_t *ops = bam_get_cigar(b);
+    hts_pos_t tail_clip = 0;
+    hts_pos_t aligned_end = bam_endpos(b);
+    int32_t k = b->core.n_cigar - 1;
+
+    // Walk backwards over the trailing clips.  If the whole cigar is clips
+    // the scan reaches the beginning and the result is meaningless.
+    while (k >= 0) {
+        char op = bam_cigar_opchr(ops[k]);
+
+        if (op != 'S' && op != 'H')
+            break;
+
+        tail_clip += bam_cigar_oplen(ops[k]);
+        k--;
+    }
+
+    return aligned_end + tail_clip;
+}
+
+
+/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.
+   op is the mate start position and cigar the textual cigar.  Lengths of
+   M, D, N, = and X operations are added; S and H lengths are added only
+   after one of those has been seen, so leading clips are ignored and
+   trailing clips are counted.  Any other operation contributes nothing.
+   An operation with no numeric prefix counts as 1.  Returns op plus the sum. */
+hts_pos_t unclipped_other_end(int64_t op, char *cigar) {
+    char *c = cigar;
+    int64_t refpos = 0;
+    int skip = 1;
+
+    while (*c && *c != '*') {
+        long num = 1;
+
+        // <ctype.h> functions need a value representable as unsigned char
+        if (isdigit((unsigned char)*c)) {
+            num = strtol(c, &c, 10);
+        }
+
+        // A string ending in digits leaves c on the terminator; stop here
+        // rather than stepping past the end of the buffer below.
+        if (*c == '\0')
+            break;
+
+        switch (*c) {
+            case 'M':
+            case 'D':
+            case 'N':
+            case '=':
+            case 'X':
+                refpos += num;
+                skip = 0; // ignore initial clips
+                break;
+
+            case 'S':
+            case 'H':
+                if (!skip) {
+                    refpos += num;
+                }
+                break;
+
+            default:
+                break;
+        }
+
+        c++;
+    }
+
+    return op + refpos;
+}
/* bam.c -- miscellaneous BAM functions.
- Copyright (C) 2008-2013, 2015, 2019-2020 Genome Research Ltd.
+ Copyright (C) 2008-2013, 2015, 2019-2020, 2022 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
b->core.flag |= BAM_FUNMAP;
return -1;
}
+
+/* Work out the read's start from its stored cigar: core.pos minus the total
+   length of the leading soft/hard clip operations, plus one. */
+hts_pos_t unclipped_start(bam1_t *b) {
+    uint32_t *ops = bam_get_cigar(b);
+    int64_t lead_clip = 0;
+    uint32_t k = 0;
+
+    // Sum the leading run of S/H operations; the first other op ends the scan.
+    while (k < b->core.n_cigar) {
+        char op = bam_cigar_opchr(ops[k]);
+
+        if (op != 'S' && op != 'H')
+            break;
+
+        lead_clip += bam_cigar_oplen(ops[k]);
+        k++;
+    }
+
+    return b->core.pos - lead_clip + 1;
+}
+
+/* Calculate the mate's unclipped start based on position and cigar string from MC tag.
+   op is the mate position and cigar the textual cigar.  The lengths of the
+   leading S/H operations are summed (an operation with no numeric prefix
+   counts as 1); scanning stops at the first other character, at '*' or at
+   the end of the string.  Returns op minus that sum, plus one. */
+hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) {
+    char *c = cigar;
+    int64_t clipped = 0;
+
+    while (*c && *c != '*') {
+        long num = 1;
+
+        // <ctype.h> functions need a value representable as unsigned char
+        if (isdigit((unsigned char)*c)) {
+            num = strtol(c, &c, 10);
+        }
+
+        if (*c == 'S' || *c == 'H') { // clips
+            clipped += num;
+        } else {
+            break;
+        }
+
+        c++;
+    }
+
+    return op - clipped + 1;
+}
+
+/* Work out the read's end from its stored cigar: bam_endpos() plus the total
+   length of the trailing soft/hard clip operations. */
+hts_pos_t unclipped_end(bam1_t *b) {
+    uint32_t *ops = bam_get_cigar(b);
+    hts_pos_t tail_clip = 0;
+    hts_pos_t aligned_end = bam_endpos(b);
+    int32_t k = b->core.n_cigar - 1;
+
+    // Walk backwards over the trailing clips.  If the whole cigar is clips
+    // the scan reaches the beginning and the result is meaningless.
+    while (k >= 0) {
+        char op = bam_cigar_opchr(ops[k]);
+
+        if (op != 'S' && op != 'H')
+            break;
+
+        tail_clip += bam_cigar_oplen(ops[k]);
+        k--;
+    }
+
+    return aligned_end + tail_clip;
+}
+
+
+/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.
+   op is the mate start position and cigar the textual cigar.  Lengths of
+   M, D, N, = and X operations are added; S and H lengths are added only
+   after one of those has been seen, so leading clips are ignored and
+   trailing clips are counted.  Any other operation contributes nothing.
+   An operation with no numeric prefix counts as 1.  Returns op plus the sum. */
+hts_pos_t unclipped_other_end(int64_t op, char *cigar) {
+    char *c = cigar;
+    int64_t refpos = 0;
+    int skip = 1;
+
+    while (*c && *c != '*') {
+        long num = 1;
+
+        // <ctype.h> functions need a value representable as unsigned char
+        if (isdigit((unsigned char)*c)) {
+            num = strtol(c, &c, 10);
+        }
+
+        // A string ending in digits leaves c on the terminator; stop here
+        // rather than stepping past the end of the buffer below.
+        if (*c == '\0')
+            break;
+
+        switch (*c) {
+            case 'M':
+            case 'D':
+            case 'N':
+            case '=':
+            case 'X':
+                refpos += num;
+                skip = 0; // ignore initial clips
+                break;
+
+            case 'S':
+            case 'H':
+                if (!skip) {
+                    refpos += num;
+                }
+                break;
+
+            default:
+                break;
+        }
+
+        c++;
+    }
+
+    return op + refpos;
+}
const char *bam_get_library(sam_hdr_t *header, const bam1_t *b);
+hts_pos_t unclipped_start(bam1_t *b);
+hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar);
+hts_pos_t unclipped_end(bam1_t *b);
+hts_pos_t unclipped_other_end(int64_t op, char *cigar);
+
#endif
/* bam2depth.c -- depth subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd.
+ Copyright (C) 2012-2016, 2018, 2019-2022 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
fprintf(fp, " -H Print a file header\n");
fprintf(fp, " -J Include reads with deletions in depth computation\n");
fprintf(fp, " -s Do not count overlapping reads within a template\n");
- sam_global_opt_help(fp, "-.---@-.");
+ sam_global_opt_help(fp, "-.--.@-.");
exit(exit_status);
}
{"min-mq", required_argument, NULL, 'Q'},
{"min-BQ", required_argument, NULL, 'q'},
{"min-bq", required_argument, NULL, 'q'},
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{NULL, 0, NULL, 0}
};
/* bam2depth.c -- depth subcommand.
Copyright (C) 2011, 2012 Broad Institute.
- Copyright (C) 2012-2016, 2018, 2019-2021 Genome Research Ltd.
+ Copyright (C) 2012-2016, 2018, 2019-2022 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk> (to 2020)
Author: James Bonfield <jkb@sanger.ac.uk> (2021 rewrite)
fprintf(fp, " -H Print a file header\n");
fprintf(fp, " -J Include reads with deletions in depth computation\n");
fprintf(fp, " -s Do not count overlapping reads within a template\n");
- sam_global_opt_help(fp, "-.---@-.");
+ sam_global_opt_help(fp, "-.--.@-.");
samtools_exit(exit_status);
}
{"min-mq", required_argument, NULL, 'Q'},
{"min-BQ", required_argument, NULL, 'q'},
{"min-bq", required_argument, NULL, 'q'},
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{NULL, 0, NULL, 0}
};
" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
fprintf(to,
-" -T TAGLIST copy arbitrary tags to the %s header line\n",
+" -T TAGLIST copy arbitrary tags to the %s header line, '*' for all\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
" -v INT default quality score if not given in file [1]\n"
hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
- kstring_t tag_list = {0,0};
- if (state->copy_tags)
- kputs("RG,BC,QT", &tag_list);
- if (opts->extra_tags) {
+ if (opts->extra_tags && (*opts->extra_tags == '*' || *opts->extra_tags == '\0'))
+ hts_set_opt(fp, FASTQ_OPT_AUX, NULL);
+ else {
+ kstring_t tag_list = {0,0};
+ if (state->copy_tags)
+ kputs("RG,BC,QT", &tag_list);
+ if (opts->extra_tags) {
+ if (tag_list.l)
+ kputc(',', &tag_list);
+ kputs(opts->extra_tags, &tag_list);
+ }
if (tag_list.l)
- kputc(',', &tag_list);
- kputs(opts->extra_tags, &tag_list);
+ hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
+ ks_free(&tag_list);
}
- if (tag_list.l)
- hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
- ks_free(&tag_list);
}
// Open a file as normal or gzipped based on filename.
" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
fprintf(to,
-" -T TAGLIST copy arbitrary tags to the %s header line\n",
+" -T TAGLIST copy arbitrary tags to the %s header line, '*' for all\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
" -v INT default quality score if not given in file [1]\n"
hts_set_opt(fp, FASTQ_OPT_BARCODE, opts->barcode_tag);
- kstring_t tag_list = {0,0};
- if (state->copy_tags)
- kputs("RG,BC,QT", &tag_list);
- if (opts->extra_tags) {
+ if (opts->extra_tags && (*opts->extra_tags == '*' || *opts->extra_tags == '\0'))
+ hts_set_opt(fp, FASTQ_OPT_AUX, NULL);
+ else {
+ kstring_t tag_list = {0,0};
+ if (state->copy_tags)
+ kputs("RG,BC,QT", &tag_list);
+ if (opts->extra_tags) {
+ if (tag_list.l)
+ kputc(',', &tag_list);
+ kputs(opts->extra_tags, &tag_list);
+ }
if (tag_list.l)
- kputc(',', &tag_list);
- kputs(opts->extra_tags, &tag_list);
+ hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
+ ks_free(&tag_list);
}
- if (tag_list.l)
- hts_set_opt(fp, FASTQ_OPT_AUX, tag_list.s);
- ks_free(&tag_list);
}
// Open a file as normal or gzipped based on filename.
hdr_out = sam_hdr_init();
}
+ // Add a version line with the sort order to the output header
+ if (sam_hdr_add_line(hdr_out, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL) < 0) {
+ fprintf(stderr, "Could not set SO and GO in the header.\n");
+ goto err;
+ }
+
// Read group
if (opts->rg_line) {
if (*opts->rg_line != '@')
hdr_out = sam_hdr_init();
}
+ // Add a version line with the sort order to the output header
+ if (sam_hdr_add_line(hdr_out, "HD", "VN", SAM_FORMAT_VERSION, "SO", "unsorted", "GO", "query", NULL) < 0) {
+ fprintf(samtools_stderr, "Could not set SO and GO in the header.\n");
+ goto err;
+ }
+
// Read group
if (opts->rg_line) {
if (*opts->rg_line != '@')
#include <htslib/hts.h>
#include <htslib/sam.h>
+#include <htslib/hfile.h>
#include <htslib/khash.h>
#include <stdlib.h>
#include <stdio.h>
-#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <unistd.h>
#include <getopt.h>
static void index_usage(FILE *fp)
{
fprintf(fp,
-"Usage: samtools index [-bc] [-m INT] <in.bam> [out.index]\n"
+"Usage: samtools index -M [-bc] [-m INT] <in1.bam> <in2.bam>...\n" /* multi-file synopsis */
+" or: samtools index [-bc] [-m INT] <in.bam> [out.index]\n" /* legacy single-file synopsis */
"Options:\n"
" -b Generate BAI-format index for BAM files [default]\n"
" -c Generate CSI-format index for BAM files\n"
" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -M Interpret all filename arguments as files to be indexed\n"
+" -o FILE Write index to FILE [alternative to <out.index> as an argument]\n"
" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
+// Decide whether FN should be taken as an index filename rather than an
+// input to be indexed.  Returns 1 if FN cannot be opened or its detected
+// format category is index_file; returns 0 otherwise, including when
+// format detection or closing the file fails.
+static int nonexistent_or_index(const char *fn)
+{
+    htsFormat detected;
+    int detect_rc, close_rc;
+    hFILE *handle = hopen(fn, "r");
+
+    if (!handle)
+        return 1;
+
+    detect_rc = hts_detect_format2(handle, fn, &detected);
+    close_rc = hclose(handle);
+
+    if (detect_rc >= 0 && close_rc >= 0)
+        return detected.category == index_file;
+
+    return 0;
+}
+
int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
+ int multiple = 0; // set by -M: every filename argument is an input to index
int n_threads = 0;
- int c, ret;
+ int n_files, c, i, ret;
+ const char *fn_idx = NULL; // index output name, from -o or the legacy second argument
- while ((c = getopt(argc, argv, "bcm:@:")) >= 0)
+ while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
case 'm': csi = 1; min_shift = atoi(optarg); break;
+ case 'M': multiple = 1; break;
+ case 'o': fn_idx = optarg; break;
case '@': n_threads = atoi(optarg); break;
default:
index_usage(stderr);
return 1;
}
- if (optind == argc) {
- index_usage(stdout);
- return 1;
- }
+ n_files = argc - optind; // count of non-option (filename) arguments
- ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads);
- switch (ret) {
- case 0:
+ if (n_files == 0) {
+ index_usage(stdout);
return 0;
+ }
- case -2:
- print_error_errno("index", "failed to open \"%s\"", argv[optind]);
- break;
+ // Handle legacy synopsis "<in.bam> [out.index]": with exactly two filenames and no -o, the second is taken as the index name when it cannot be opened or is detected as an index file
+ if (n_files == 2 && !fn_idx && nonexistent_or_index(argv[optind+1])) {
+ n_files = 1;
+ fn_idx = argv[optind+1];
+ }
- case -3:
- print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
- break;
+ if (n_files > 1 && !multiple) {
+ print_error("index", "use -M to enable indexing more than one alignment file");
+ return EXIT_FAILURE;
+ }
- case -4:
- if (argv[optind+1])
- print_error("index", "failed to create or write index \"%s\"", argv[optind+1]);
- else
- print_error("index", "failed to create or write index");
- break;
+ if (fn_idx && n_files > 1) {
+ // TODO In future we may allow %* placeholders or similar
+ print_error("index", "can't use -o with multiple input alignment files");
+ return EXIT_FAILURE;
+ }
- default:
- print_error_errno("index", "failed to create index for \"%s\"", argv[optind]);
- break;
+ for (i = optind; i < optind + n_files; i++) { // index each input in turn, stopping at the first failure
+ ret = sam_index_build3(argv[i], fn_idx, csi? min_shift : 0, n_threads);
+ if (ret < 0) {
+ if (ret == -2)
+ print_error_errno("index", "failed to open \"%s\"", argv[i]);
+ else if (ret == -3)
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[i]);
+ else if (ret == -4 && fn_idx)
+ print_error("index", "failed to create or write index \"%s\"", fn_idx);
+ else if (ret == -4)
+ print_error("index", "failed to create or write index");
+ else
+ print_error_errno("index", "failed to create index for \"%s\"", argv[i]);
+ return EXIT_FAILURE;
+ }
}
- return EXIT_FAILURE;
+ return EXIT_SUCCESS;
}
/*
#include <htslib/hts.h>
#include <htslib/sam.h>
+#include <htslib/hfile.h>
#include <htslib/khash.h>
#include <stdlib.h>
#include <stdio.h>
-#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <unistd.h>
#include <getopt.h>
static void index_usage(FILE *fp)
{
fprintf(fp,
-"Usage: samtools index [-bc] [-m INT] <in.bam> [out.index]\n"
+"Usage: samtools index -M [-bc] [-m INT] <in1.bam> <in2.bam>...\n" /* multi-file synopsis */
+" or: samtools index [-bc] [-m INT] <in.bam> [out.index]\n" /* legacy single-file synopsis */
"Options:\n"
" -b Generate BAI-format index for BAM files [default]\n"
" -c Generate CSI-format index for BAM files\n"
" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -M Interpret all filename arguments as files to be indexed\n"
+" -o FILE Write index to FILE [alternative to <out.index> as an argument]\n"
" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
+// Decide whether FN should be taken as an index filename rather than an
+// input to be indexed.  Returns 1 if FN cannot be opened or its detected
+// format category is index_file; returns 0 otherwise, including when
+// format detection or closing the file fails.
+static int nonexistent_or_index(const char *fn)
+{
+    htsFormat detected;
+    int detect_rc, close_rc;
+    hFILE *handle = hopen(fn, "r");
+
+    if (!handle)
+        return 1;
+
+    detect_rc = hts_detect_format2(handle, fn, &detected);
+    close_rc = hclose(handle);
+
+    if (detect_rc >= 0 && close_rc >= 0)
+        return detected.category == index_file;
+
+    return 0;
+}
+
int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
+ int multiple = 0; // set by -M: every filename argument is an input to index
int n_threads = 0;
- int c, ret;
+ int n_files, c, i, ret;
+ const char *fn_idx = NULL; // index output name, from -o or the legacy second argument
- while ((c = getopt(argc, argv, "bcm:@:")) >= 0)
+ while ((c = getopt(argc, argv, "bcm:Mo:@:")) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
case 'm': csi = 1; min_shift = atoi(optarg); break;
+ case 'M': multiple = 1; break;
+ case 'o': fn_idx = optarg; break;
case '@': n_threads = atoi(optarg); break;
default:
index_usage(samtools_stderr);
return 1;
}
- if (optind == argc) {
- index_usage(samtools_stdout);
- return 1;
- }
+ n_files = argc - optind; // count of non-option (filename) arguments
- ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads);
- switch (ret) {
- case 0:
+ if (n_files == 0) {
+ index_usage(samtools_stdout);
return 0;
+ }
- case -2:
- print_error_errno("index", "failed to open \"%s\"", argv[optind]);
- break;
+ // Handle legacy synopsis "<in.bam> [out.index]": with exactly two filenames and no -o, the second is taken as the index name when it cannot be opened or is detected as an index file
+ if (n_files == 2 && !fn_idx && nonexistent_or_index(argv[optind+1])) {
+ n_files = 1;
+ fn_idx = argv[optind+1];
+ }
- case -3:
- print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
- break;
+ if (n_files > 1 && !multiple) {
+ print_error("index", "use -M to enable indexing more than one alignment file");
+ return EXIT_FAILURE;
+ }
- case -4:
- if (argv[optind+1])
- print_error("index", "failed to create or write index \"%s\"", argv[optind+1]);
- else
- print_error("index", "failed to create or write index");
- break;
+ if (fn_idx && n_files > 1) {
+ // TODO In future we may allow %* placeholders or similar
+ print_error("index", "can't use -o with multiple input alignment files");
+ return EXIT_FAILURE;
+ }
- default:
- print_error_errno("index", "failed to create index for \"%s\"", argv[optind]);
- break;
+ for (i = optind; i < optind + n_files; i++) { // index each input in turn, stopping at the first failure
+ ret = sam_index_build3(argv[i], fn_idx, csi? min_shift : 0, n_threads);
+ if (ret < 0) {
+ if (ret == -2)
+ print_error_errno("index", "failed to open \"%s\"", argv[i]);
+ else if (ret == -3)
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[i]);
+ else if (ret == -4 && fn_idx)
+ print_error("index", "failed to create or write index \"%s\"", fn_idx);
+ else if (ret == -4)
+ print_error("index", "failed to create or write index");
+ else
+ print_error_errno("index", "failed to create index for \"%s\"", argv[i]);
+ return EXIT_FAILURE;
+ }
}
- return EXIT_FAILURE;
+ return EXIT_SUCCESS;
}
/*
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2021 Genome Research Ltd.
+ Copyright (C) 2017-2022 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
#include "htslib/klist.h"
#include "htslib/kstring.h"
#include "tmp_file.h"
+#include "bam.h"
typedef struct {
int rgx_x;
int rgx_y;
int rgx_t;
+ char *barcode;
+ regex_t *bc_rgx;
} md_param_t;
typedef struct {
hts_pos_t other_coord;
int32_t this_ref;
int32_t other_ref;
+ int32_t barcode;
int8_t single;
int8_t leftmost;
int8_t orientation;
khint_t hash;
if (key.single) {
- unsigned char sig[13];
+ unsigned char sig[17];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.orientation, 1); i += 1;
+ memcpy(sig + i, &key.barcode, 4); i += 4;
hash = do_hash(sig, i);
} else {
- unsigned char sig[26];
+ unsigned char sig[30];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.other_coord, 8); i += 8;
memcpy(sig + i, &key.leftmost, 1); i += 1;
memcpy(sig + i, &key.orientation, 1); i += 1;
+ memcpy(sig + i, &key.barcode, 4); i += 4;
hash = do_hash(sig, i);
}
match = 0;
else if (a.single != b.single)
match = 0;
+ else if (a.barcode != b.barcode)
+ match = 0;
if (!a.single) {
if (a.other_coord != b.other_coord)
KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id
-
-/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */
-
-static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) {
- char *c = cigar;
- int64_t clipped = 0;
-
- while (*c && *c != '*') {
- long num = 0;
-
- if (isdigit((int)*c)) {
- num = strtol(c, &c, 10);
- } else {
- num = 1;
- }
-
- if (*c == 'S' || *c == 'H') { // clips
- clipped += num;
- } else {
- break;
- }
-
- c++;
- }
-
- return op - clipped + 1;
-}
-
-
-/* Calculate the current read's start based on the stored cigar string. */
-
-static hts_pos_t unclipped_start(bam1_t *b) {
- uint32_t *cigar = bam_get_cigar(b);
- int64_t clipped = 0;
- uint32_t i;
-
- for (i = 0; i < b->core.n_cigar; i++) {
- char c = bam_cigar_opchr(cigar[i]);
-
- if (c == 'S' || c == 'H') { // clips
- clipped += bam_cigar_oplen(cigar[i]);
- } else {
- break;
- }
- }
-
- return b->core.pos - clipped + 1;
-}
-
-
-/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/
-
-static hts_pos_t unclipped_other_end(int64_t op, char *cigar) {
- char *c = cigar;
- int64_t refpos = 0;
- int skip = 1;
-
- while (*c && *c != '*') {
- long num = 0;
-
- if (isdigit((int)*c)) {
- num = strtol(c, &c, 10);
- } else {
- num = 1;
- }
-
- switch (*c) {
- case 'M':
- case 'D':
- case 'N':
- case '=':
- case 'X':
- refpos += num;
- skip = 0; // ignore initial clips
- break;
-
- case 'S':
- case 'H':
- if (!skip) {
- refpos += num;
- }
- break;
- }
-
- c++;
- }
-
- return op + refpos;
-}
-
-
-/* Calculate the current read's end based on the stored cigar string. */
-
-static hts_pos_t unclipped_end(bam1_t *b) {
- uint32_t *cigar = bam_get_cigar(b);
- hts_pos_t end_pos, clipped = 0;
- int32_t i;
-
- end_pos = bam_endpos(b);
-
- // now get the clipped end bases (if any)
- // if we get to the beginning of the cigar string
- // without hitting a non-clip then the results are meaningless
- for (i = b->core.n_cigar - 1; i >= 0; i--) {
- char c = bam_cigar_opchr(cigar[i]);
-
- if (c == 'S' || c == 'H') { // clips
- clipped += bam_cigar_oplen(cigar[i]);
- } else {
- break;
- }
- }
-
- return end_pos + clipped;
-}
-
-
/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
static khint32_t do_hash(unsigned char *key, khint32_t len) {
the reference id, orientation and whether the current
read is leftmost of the pair. */
-static int make_pair_key_template(key_data_t *key, bam1_t *bam) {
- hts_pos_t this_coord, other_coord, this_end, other_end;
- int32_t this_ref, other_ref;
- int8_t orientation, leftmost;
+
+static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
+ hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
+ int32_t this_ref, other_ref, barcode = 0;
+ int8_t orientation, left_read;
uint8_t *data;
- char *cig;
+ char *cig, *bar;
+ long incoming_warnings = *warnings;
this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
other_ref = bam->core.mtid + 1;
}
// work out orientations
- if (this_ref != other_ref) {
- leftmost = this_ref < other_ref;
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- leftmost = this_coord <= other_coord;
- } else {
- leftmost = this_end <= other_end;
- }
+ if (param->mode == MD_MODE_TEMPLATE) {
+
+ if (this_ref != other_ref) {
+ leftmost = this_ref < other_ref;
} else {
- if (bam_is_rev(bam)) {
- leftmost = this_end <= other_coord;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ if (!bam_is_rev(bam)) {
+ leftmost = this_coord <= other_coord;
+ } else {
+ leftmost = this_end <= other_end;
+ }
} else {
- leftmost = this_coord <= other_end;
+ if (bam_is_rev(bam)) {
+ leftmost = this_end <= other_coord;
+ } else {
+ leftmost = this_coord <= other_end;
+ }
}
}
- }
- // pair orientation
- if (leftmost) {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- other_coord = other_end;
+ // pair orientation
+ if (leftmost) {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ other_coord = other_end;
- if (!bam_is_rev(bam)) {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_FF;
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
} else {
- orientation = O_RR;
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
}
} else {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_RR;
+ if (!bam_is_rev(bam)) {
+ orientation = O_FR;
+ other_coord = other_end;
} else {
- orientation = O_FF;
+ orientation = O_RF;
+ this_coord = this_end;
}
}
} else {
- if (!bam_is_rev(bam)) {
- orientation = O_FR;
- other_coord = other_end;
- } else {
- orientation = O_RF;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
this_coord = this_end;
- }
- }
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- this_coord = this_end;
- if (!bam_is_rev(bam)) {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_RR;
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
} else {
- orientation = O_FF;
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
}
} else {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_FF;
+ if (!bam_is_rev(bam)) {
+ orientation = O_RF;
+ other_coord = other_end;
} else {
- orientation = O_RR;
+ orientation = O_FR;
+ this_coord = this_end;
}
}
- } else {
- if (!bam_is_rev(bam)) {
- orientation = O_RF;
- other_coord = other_end;
- } else {
- orientation = O_FR;
- this_coord = this_end;
- }
}
- }
-
- if (!leftmost)
- leftmost = R_RI;
- else
- leftmost = R_LE;
+ } else { // MD_MODE_SEQUENCE
- key->single = 0;
- key->this_ref = this_ref;
- key->this_coord = this_coord;
- key->other_ref = other_ref;
- key->other_coord = other_coord;
- key->leftmost = leftmost;
- key->orientation = orientation;
-
- return 0;
-}
-
-
-static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) {
- hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
- int32_t this_ref, other_ref;
- int8_t orientation, left_read;
- uint8_t *data;
- char *cig;
-
- this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
- other_ref = bam->core.mtid + 1;
-
- this_coord = unclipped_start(bam);
- this_end = unclipped_end(bam);
-
- if ((data = bam_aux_get(bam, "MC"))) {
- if (!(cig = bam_aux2Z(data))) {
- fprintf(stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
- return 1;
- }
-
- other_end = unclipped_other_end(bam->core.mpos, cig);
- other_coord = unclipped_other_start(bam->core.mpos, cig);
- } else {
- fprintf(stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
- return 1;
- }
-
- // work out orientations
- if (this_ref != other_ref) {
- leftmost = this_ref - other_ref;
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- leftmost = this_coord - other_coord;
- } else {
- leftmost = this_end - other_end;
- }
+ if (this_ref != other_ref) {
+ leftmost = this_ref - other_ref;
} else {
- if (bam_is_rev(bam)) {
- leftmost = this_end - other_coord;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ if (!bam_is_rev(bam)) {
+ leftmost = this_coord - other_coord;
+ } else {
+ leftmost = this_end - other_end;
+ }
} else {
- leftmost = this_coord - other_end;
+ if (bam_is_rev(bam)) {
+ leftmost = this_end - other_coord;
+ } else {
+ leftmost = this_coord - other_end;
+ }
}
}
- }
- if (leftmost < 0) {
- leftmost = 1;
- } else if (leftmost > 0) {
- leftmost = 0;
- } else {
- // tie breaks
+ if (leftmost < 0) {
+ leftmost = 1;
+ } else if (leftmost > 0) {
+ leftmost = 0;
+ } else {
+ // tie breaks
- if (bam->core.pos == bam->core.mpos) {
- if (bam->core.flag & BAM_FREAD1) {
+ if (bam->core.pos == bam->core.mpos) {
+ if (bam->core.flag & BAM_FREAD1) {
+ leftmost = 1;
+ } else {
+ leftmost = 0;
+ }
+ } else if (bam->core.pos < bam->core.mpos) {
leftmost = 1;
} else {
leftmost = 0;
}
- } else if (bam->core.pos < bam->core.mpos) {
- leftmost = 1;
- } else {
- leftmost = 0;
}
- }
- // pair orientation
- if (leftmost) {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ // pair orientation
+ if (leftmost) {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- orientation = O_FF;
+ if (!bam_is_rev(bam)) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
} else {
- orientation = O_RR;
+ if (!bam_is_rev(bam)) {
+ orientation = O_FR;
+ } else {
+ orientation = O_RF;
+ }
}
} else {
- if (!bam_is_rev(bam)) {
- orientation = O_FR;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+
+ if (!bam_is_rev(bam)) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
} else {
- orientation = O_RF;
+ if (!bam_is_rev(bam)) {
+ orientation = O_RF;
+ } else {
+ orientation = O_FR;
+ }
}
}
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- orientation = O_RR;
- } else {
- orientation = O_FF;
- }
+ if (!bam_is_rev(bam)) {
+ this_coord = unclipped_start(bam);
} else {
- if (!bam_is_rev(bam)) {
- orientation = O_RF;
- } else {
- orientation = O_FR;
- }
+ this_coord = unclipped_end(bam);
+ }
+
+ if (!bam_is_mrev(bam)) {
+ other_coord = unclipped_other_start(bam->core.mpos, cig);
+ } else {
+ other_coord = unclipped_other_end(bam->core.mpos, cig);
}
}
else
left_read = R_LE;
- if (!bam_is_rev(bam)) {
- this_coord = unclipped_start(bam);
- } else {
- this_coord = unclipped_end(bam);
+ if (param->barcode) {
+ if ((data = bam_aux_get(bam, param->barcode))) {
+ if (!(bar = bam_aux2Z(data))) {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ }
+ } else {
+ barcode = do_hash((unsigned char *)bar, strlen(bar));
+ }
+ }
+ } else if (param->bc_rgx) {
+ int result;
+ regmatch_t matches[3];
+ size_t max_matches = 2;
+ char *qname = bam_get_qname(bam);
+
+ if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) {
+ int bc_start, bc_end;
+
+ bc_start = matches[1].rm_so;
+ bc_end = matches[1].rm_eo;
+
+ if (bc_start != -1) {
+ barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start);
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ }
+ }
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ char warn_msg[256];
+
+ regerror(result, param->bc_rgx, warn_msg, 256);
+ fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ }
+ }
}
- if (!bam_is_mrev(bam)) {
- other_coord = unclipped_other_start(bam->core.mpos, cig);
- } else {
- other_coord = unclipped_other_end(bam->core.mpos, cig);
+ if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
+ fprintf(stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ *warnings);
}
key->single = 0;
key->other_coord = other_coord;
key->leftmost = left_read;
key->orientation = orientation;
+ key->barcode = barcode;
return 0;
}
+
/* Create a signature hash of single read (or read with an unmatched pair).
Uses unclipped start (or end depending on orientation), reference id,
and orientation. */
-static void make_single_key(key_data_t *key, bam1_t *bam) {
+static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
hts_pos_t this_coord;
- int32_t this_ref;
+ int32_t this_ref, barcode = 0;
int8_t orientation;
+ uint8_t *data;
+ char *bar;
+ long incoming_warnings = *warnings;
this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
orientation = O_FF;
}
+ if (param->barcode) {
+ if ((data = bam_aux_get(bam, param->barcode))) {
+ if (!(bar = bam_aux2Z(data))) {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ }
+ } else {
+ barcode = do_hash((unsigned char *)bar, strlen(bar));
+ }
+ }
+ } else if (param->bc_rgx) {
+ int result;
+ regmatch_t matches[3];
+ size_t max_matches = 2;
+ char *qname = bam_get_qname(bam);
+
+ if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) {
+ int bc_start, bc_end;
+
+ bc_start = matches[1].rm_so;
+ bc_end = matches[1].rm_eo;
+
+ if (bc_start != -1) {
+ barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start);
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ }
+ }
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ char warn_msg[256];
+
+ regerror(result, param->bc_rgx, warn_msg, 256);
+ fprintf(stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ }
+ }
+ }
+
+ if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
+ fprintf(stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ *warnings);
+ }
+
key->single = 1;
key->this_ref = this_ref;
key->this_coord = this_coord;
key->orientation = orientation;
+ key->barcode = barcode;
}
}
-/* Get the position of the coordinates from the read name. */
-static inline int get_coordinate_positions_colons(const char *qname, int *xpos, int *ypos) {
+/* Get coordinates from the standard Illumina style read names.
+ Returned values are of the x and y coordinates and a section of
+ the read name to test (t) for string equality e.g. lane and tile part.
+
+ Returns 0 on success, with *x_coord and *y_coord parsed by strtol,
+ *t_beg set to 0 and *t_end set to the offset of the x field in qname.
+ Returns 1, after incrementing *warnings, when the number of colons is
+ not 3, 4, 6 or 7 or when no digits could be parsed for a coordinate;
+ a message is printed only while *warnings <= BMD_WARNING_MAX.
+ param is not referenced in this function. */
+
+static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) {
int sep = 0;
int pos = 0;
+ int xpos = 0, ypos = 0; // offsets into qname where the x and y fields start
+ char *end; // strtol end pointer; equal to the start pointer when nothing was parsed
while (qname[pos]) {
if (qname[pos] == ':') {
sep++;
if (sep == 2) {
- *xpos = pos + 1;
+ xpos = pos + 1;
} else if (sep == 3) {
- *ypos = pos + 1;
+ ypos = pos + 1;
} else if (sep == 4) { // HiSeq style names
- *xpos = *ypos;
- *ypos = pos + 1;
+ xpos = ypos;
+ ypos = pos + 1;
} else if (sep == 5) { // Newer Illumina format
- *xpos = pos + 1;
+ xpos = pos + 1;
} else if (sep == 6) {
- *ypos = pos + 1;
+ ypos = pos + 1;
}
}
pos++;
}
- return sep;
+ /* The most current Illumina read format at time of writing is:
+ @machine:run:flowcell:lane:tile:x:y:UMI or
+ @machine:run:flowcell:lane:tile:x:y
+
+ Counting the separating colons gives us a quick format check.
+ Older name formats have fewer elements.
+ */
+
+ if (!(sep == 3 || sep == 4 || sep == 6 || sep == 7)) {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname);
+ }
+
+ return 1;
+ } else {
+ *x_coord = strtol(qname + xpos, &end, 10);
+
+ if ((qname + xpos) == end) { // no digits consumed for x
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname);
+ }
+
+ return 1;
+ }
+
+ *y_coord = strtol(qname + ypos, &end, 10);
+
+ if ((qname + ypos) == end) { // no digits consumed for y
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", qname);
+ }
+
+ return 1;
+ }
+
+ *t_beg = 0;
+ *t_end = xpos;
+ }
+
+ return 0;
}
-/* Get the position of the coordinates from the read name.
- Positions returned are of the x and y coordinate and an optional section of
+/* Get the coordinates from the read name.
+ The name is matched against the regex in param->rgx; param->rgx_x and
+ param->rgx_y index the capture groups that hold x and y.
+ Returns 0 on success.
+ Returns -1, without touching *warnings, when the regex does not match
+ or one of the needed groups is unset.
+ Returns 1, after incrementing *warnings, when a captured coordinate is
+ longer than 254 characters or does not start with a number.
+ Returned values are of the x and y coordinates and an optional section of
the read name to test (t) for string equality e.g. lane and tile part. */
-static inline int get_coordinate_positions_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, int *xpos, int *ypos) {
+
+static inline int get_coordinates_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) {
regmatch_t matches[5];
size_t max_matches = 5;
+ int xpos, ypos, xend, yend, xlen, ylen; // start, end and length of the x and y captures within qname
+ char coord[255]; // scratch copy of one captured coordinate (at most 254 chars plus NUL)
+ char *end;
if (!param->rgx_t)
max_matches = 4;
if (regexec(param->rgx, qname, max_matches, matches, 0))
return -1;
- *xpos = matches[param->rgx_x].rm_so;
- *ypos = matches[param->rgx_y].rm_so;
+ xpos = matches[param->rgx_x].rm_so;
+ ypos = matches[param->rgx_y].rm_so;
if (param->rgx_t) {
*t_beg = matches[param->rgx_t].rm_so;
*t_beg = *t_end = 0;
}
- if (*xpos == -1 || *ypos == -1 || *t_beg == -1)
+ if (xpos == -1 || ypos == -1 || *t_beg == -1) // -1 offset: that group did not take part in the match
return -1;
- return 7; // 3, 4, 6 and 7 are successes in the previous function
-}
+ xend = matches[param->rgx_x].rm_eo;
+ yend = matches[param->rgx_y].rm_eo;
+ if ((xlen = xend - xpos) > 254) { // capture would not fit in coord[]
+ (*warnings)++;
-static int get_coordinate_positions(md_param_t *param, const char *qname, int *beg, int *end, int *xpos, int *ypos, long *warnings) {
- int ret = 0;
- int seps;
+ if (*warnings <= BMD_WARNING_MAX) { // every failure is counted, only the first BMD_WARNING_MAX are printed
+ fprintf(stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen);
+ }
- if (param->rgx == NULL) {
- seps = get_coordinate_positions_colons(qname, xpos, ypos);
- *beg = 0;
- *end = *xpos;
- } else {
- seps = get_coordinate_positions_regex(param, qname, beg, end, xpos, ypos);
+ return 1;
}
- /* The most current Illumina read format at time of writing is:
- @machine:run:flowcell:lane:tile:x:y:UMI or
- @machine:run:flowcell:lane:tile:x:y
+ strncpy(coord, qname + xpos, xlen);
+ coord[xlen] = '\0'; // terminate the copied capture explicitly
+ *x_coord = strtol(coord, &end, 10);
- Counting the separating colons gives us a quick format check.
- Older name formats have fewer elements.
- */
-
- if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
+ if (coord == end) { // strtol consumed nothing: the x capture is not a number
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname);
+ fprintf(stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord);
}
- ret = 1;
+ return 1;
}
- return ret;
-}
-
+ if ((ylen = yend - ypos) > 254) { // capture would not fit in coord[]
+ (*warnings)++;
-static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) {
- int ret = 1;
- int xpos = 0, ypos = 0;
- long x = 0, y = 0;
- char *end;
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen);
+ }
- if (get_coordinate_positions(param, name, t_beg, t_end, &xpos, &ypos, warnings)) {
- return ret;
+ return 1;
}
- x = strtol(name + xpos, &end, 10);
+ strncpy(coord, qname + ypos, ylen);
+ coord[ylen] = '\0'; // terminate the copied capture explicitly
+ *y_coord = strtol(coord, &end, 10);
- if ((name + xpos) == end) {
+ if (coord == end) { // strtol consumed nothing: the y capture is not a number
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
+ fprintf(stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord);
}
- return ret;
+ return 1;
}
- y = strtol(name + ypos, &end, 10);
+ return 0;
+}
- if ((name + ypos) == end) {
- (*warnings)++;
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
- }
+static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { // picks the read-name parser and returns its result unchanged
+ int ret = 1; // always overwritten by one of the two parsers below
- return ret;
+ if (param->rgx == NULL) { // no coordinate regex set: use the colon-delimited name parser
+ ret = get_coordinates_colons(param, name, t_beg, t_end, x_coord, y_coord, warnings);
+ } else { // a coordinate regex is set: use the regex parser
+ ret = get_coordinates_regex(param, name, t_beg, t_end, x_coord, y_coord, warnings);
}
- *x_coord = x;
- *y_coord = y;
- ret = 0;
-
return ret;
}
-/* Using the coordinates from the Illumina read name, see whether the duplicated read is
+/* Using the coordinates from the read name, see whether the duplicated read is
close enough (set by max_dist) to the original to be counted as optical.*/
-static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) {
+static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) {
int ret = 0;
char *original, *duplicate;
- int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0;
+ long ox, oy, dx, dy;
int o_beg = 0, o_end = 0, d_beg = 0, d_end = 0;
-
original = bam_get_qname(ori);
duplicate = bam_get_qname(dup);
- if (get_coordinate_positions(param, original, &o_beg, &o_end, &oxpos, &oypos, warnings)) {
+ if (get_coordinates(param, original, &o_beg, &o_end, &ox, &oy, warnings)) {
return ret;
}
- if (get_coordinate_positions(param, duplicate, &d_beg, &d_end, &dxpos, &dypos, warnings)) {
+ if (get_coordinates(param, duplicate, &d_beg, &d_end, &dx, &dy, warnings)) {
return ret;
}
if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
- // the initial parts match, look at the numbers
- long ox, oy, dx, dy, xdiff, ydiff;
- char *end;
-
- ox = strtol(original + oxpos, &end, 10);
-
- if ((original + oxpos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original);
- }
-
- return ret;
- }
-
- dx = strtol(duplicate + dxpos, &end, 10);
-
- if ((duplicate + dxpos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate);
- }
-
- return ret;
- }
+ long xdiff, ydiff;
if (ox > dx) {
xdiff = ox - dx;
if (xdiff <= max_dist) {
// still might be optical
- oy = strtol(original + oypos, &end, 10);
-
- if ((original + oypos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original);
- }
-
- return ret;
- }
-
- dy = strtol(duplicate + dypos, &end, 10);
-
- if ((duplicate + dypos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate);
- }
-
- return ret;
- }
-
if (oy > dy) {
ydiff = oy - dy;
} else {
}
if (param->opt_dist) { // mark optical duplicates
- if (optical_duplicate(param, ori, dup, param->opt_dist, warn)) {
+ if (is_optical_duplicate(param, ori, dup, param->opt_dist, warn)) {
bam_aux_update_str(dup, "dt", 3, "SQ");
dup_type = 'O';
(*optical)++;
int ret;
long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical;
long np_duplicate, np_opt_duplicate;
- long opt_warnings = 0;
+ long opt_warnings = 0, bc_warnings = 0;
tmp_file_t temp;
char *idx_fn = NULL;
int exclude = 0;
key_data_t single_key;
in_hash_t *bp;
- if (param->mode) {
- if (make_pair_key_sequence(&pair_key, in_read->b)) {
- fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
- goto fail;
- }
- } else {
- if (make_pair_key_template(&pair_key, in_read->b)) {
- fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
- goto fail;
- }
+ if (make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) {
+ fprintf(stderr, "[markdup] error: unable to assign pair hash key.\n");
+ goto fail;
}
- make_single_key(&single_key, in_read->b);
+ make_single_key(param, &single_key, in_read->b, &bc_warnings);
pair++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
key_data_t single_key;
in_hash_t *bp;
- make_single_key(&single_key, in_read->b);
+ make_single_key(param, &single_key, in_read->b, &bc_warnings);
single++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
opt_warnings);
}
+ if (bc_warnings) {
+ fprintf(stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings);
+ }
+
if (param->do_stats) {
FILE *fp;
int file_open = 0;
fprintf(stderr, " --read-coords STR Regex for coords from read name.\n");
fprintf(stderr, " --coords-order STR Order of regex elements. txy (default). With t being a part of\n"
" the read names that must be equal and x/y being coordinates.\n");
+ fprintf(stderr, " --barcode-tag STR Use barcode a tag that duplicates much match.\n");
+ fprintf(stderr, " --barcode-name Use the UMI/barcode in the read name (eigth colon delimited part).\n");
+ fprintf(stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n");
fprintf(stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
int bam_markdup(int argc, char **argv) {
- int c, ret;
+ int c, ret, bc_name = 0;
char wmode[4] = {'w', 'b', 0, 0};
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
kstring_t tmpprefix = {0, 0, NULL};
struct stat st;
unsigned int t;
- char *regex = NULL;
+ char *regex = NULL, *bc_regex = NULL;
char *regex_order = "txy";
- md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL, NULL, 0, 0, 0};
+ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"no-multi-dup", no_argument, NULL, 1003},
{"read-coords", required_argument, NULL, 1004},
{"coords-order", required_argument, NULL, 1005},
+ {"barcode-tag", required_argument, NULL, 1006},
+ {"barcode-name", no_argument, NULL, 1007},
+ {"barcode-rgx", required_argument, NULL, 1008},
{NULL, 0, NULL, 0}
};
case 1003: param.check_chain = 0; break;
case 1004: regex = optarg; break;
case 1005: regex_order = optarg; break;
+ case 1006: param.barcode = optarg; break;
+ case 1007: bc_name = 1; break;
+ case 1008: bc_name = 1, bc_regex = optarg; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
if (optind + 2 > argc)
return markdup_usage();
+ if (param.barcode && bc_name) {
+ fprintf(stderr, "[markdup] Error: cannot specify --barcode-tag and "
+ "--barcode-name (or --barcode-rgx) at same time.\n");
+ return 1;
+ }
+
if (param.opt_dist < 0) param.opt_dist = 0;
if (param.max_length < 0) param.max_length = 300;
param.rgx_y = 2;
param.rgx_t = 0;
} else {
- fprintf(stderr, "[markdup] error: could not recognise regex coorindate order \"%s\".\n", regex_order);
+ fprintf(stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order);
return 1;
}
}
}
+ if (bc_name) {
+ int result;
+
+ /* From Illumina UMI documentation: "The UMI sequence is located in the
+ eighth colon-delimited field of the read name (QNAME)". */
+ char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)";
+
+ if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) {
+ fprintf(stderr, "[markdup] error: could not allocate memory for barcode regex.\n");
+ return 1;
+ }
+
+ if (bc_regex) {
+ rgx = bc_regex;
+ }
+
+ if ((result = regcomp(param.bc_rgx, rgx, REG_EXTENDED))) {
+ char err_msg[256];
+
+ regerror(result, param.bc_rgx, err_msg, 256);
+ fprintf(stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg);
+ free(param.bc_rgx);
+ return 1;
+ }
+ }
+
param.in = sam_open_format(argv[optind], "r", &ga.in);
if (!param.in) {
free(param.rgx);
}
+ if (param.bc_rgx) {
+ regfree(param.bc_rgx);
+ free(param.bc_rgx);
+ }
+
free(param.arg_list);
free(tmpprefix.s);
sam_global_args_free(&ga);
/* bam_markdup.c -- Mark duplicates from a coord sorted file that has gone
through fixmates with the mate scoring option on.
- Copyright (C) 2017-2021 Genome Research Ltd.
+ Copyright (C) 2017-2022 Genome Research Ltd.
Author: Andrew Whitwham <aw7@sanger.ac.uk>
#include "htslib/klist.h"
#include "htslib/kstring.h"
#include "tmp_file.h"
+#include "bam.h"
typedef struct {
int rgx_x;
int rgx_y;
int rgx_t;
+ char *barcode;
+ regex_t *bc_rgx;
} md_param_t;
typedef struct {
hts_pos_t other_coord;
int32_t this_ref;
int32_t other_ref;
+ int32_t barcode;
int8_t single;
int8_t leftmost;
int8_t orientation;
khint_t hash;
if (key.single) {
- unsigned char sig[13];
+ unsigned char sig[17];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.orientation, 1); i += 1;
+ memcpy(sig + i, &key.barcode, 4); i += 4;
hash = do_hash(sig, i);
} else {
- unsigned char sig[26];
+ unsigned char sig[30];
memcpy(sig + i, &key.this_ref, 4); i += 4;
memcpy(sig + i, &key.this_coord, 8); i += 8;
memcpy(sig + i, &key.other_coord, 8); i += 8;
memcpy(sig + i, &key.leftmost, 1); i += 1;
memcpy(sig + i, &key.orientation, 1); i += 1;
+ memcpy(sig + i, &key.barcode, 4); i += 4;
hash = do_hash(sig, i);
}
match = 0;
else if (a.single != b.single)
match = 0;
+ else if (a.barcode != b.barcode)
+ match = 0;
if (!a.single) {
if (a.other_coord != b.other_coord)
KLIST_INIT(read_queue, read_queue_t, __free_queue_element) // the reads buffer
KHASH_MAP_INIT_STR(duplicates, dup_map_t) // map of duplicates for supplementary dup id
-
-/* Calculate the mate's unclipped start based on position and cigar string from MC tag. */
-
-static hts_pos_t unclipped_other_start(hts_pos_t op, char *cigar) {
- char *c = cigar;
- int64_t clipped = 0;
-
- while (*c && *c != '*') {
- long num = 0;
-
- if (isdigit((int)*c)) {
- num = strtol(c, &c, 10);
- } else {
- num = 1;
- }
-
- if (*c == 'S' || *c == 'H') { // clips
- clipped += num;
- } else {
- break;
- }
-
- c++;
- }
-
- return op - clipped + 1;
-}
-
-
-/* Calculate the current read's start based on the stored cigar string. */
-
-static hts_pos_t unclipped_start(bam1_t *b) {
- uint32_t *cigar = bam_get_cigar(b);
- int64_t clipped = 0;
- uint32_t i;
-
- for (i = 0; i < b->core.n_cigar; i++) {
- char c = bam_cigar_opchr(cigar[i]);
-
- if (c == 'S' || c == 'H') { // clips
- clipped += bam_cigar_oplen(cigar[i]);
- } else {
- break;
- }
- }
-
- return b->core.pos - clipped + 1;
-}
-
-
-/* Calculate the mate's unclipped end based on start position and cigar string from MC tag.*/
-
-static hts_pos_t unclipped_other_end(int64_t op, char *cigar) {
- char *c = cigar;
- int64_t refpos = 0;
- int skip = 1;
-
- while (*c && *c != '*') {
- long num = 0;
-
- if (isdigit((int)*c)) {
- num = strtol(c, &c, 10);
- } else {
- num = 1;
- }
-
- switch (*c) {
- case 'M':
- case 'D':
- case 'N':
- case '=':
- case 'X':
- refpos += num;
- skip = 0; // ignore initial clips
- break;
-
- case 'S':
- case 'H':
- if (!skip) {
- refpos += num;
- }
- break;
- }
-
- c++;
- }
-
- return op + refpos;
-}
-
-
-/* Calculate the current read's end based on the stored cigar string. */
-
-static hts_pos_t unclipped_end(bam1_t *b) {
- uint32_t *cigar = bam_get_cigar(b);
- hts_pos_t end_pos, clipped = 0;
- int32_t i;
-
- end_pos = bam_endpos(b);
-
- // now get the clipped end bases (if any)
- // if we get to the beginning of the cigar string
- // without hitting a non-clip then the results are meaningless
- for (i = b->core.n_cigar - 1; i >= 0; i--) {
- char c = bam_cigar_opchr(cigar[i]);
-
- if (c == 'S' || c == 'H') { // clips
- clipped += bam_cigar_oplen(cigar[i]);
- } else {
- break;
- }
- }
-
- return end_pos + clipped;
-}
-
-
/* The Bob Jenkins one_at_a_time hash to reduce the key to a 32 bit value. */
static khint32_t do_hash(unsigned char *key, khint32_t len) {
the reference id, orientation and whether the current
read is leftmost of the pair. */
-static int make_pair_key_template(key_data_t *key, bam1_t *bam) {
- hts_pos_t this_coord, other_coord, this_end, other_end;
- int32_t this_ref, other_ref;
- int8_t orientation, leftmost;
+
+static int make_pair_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
+ hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
+ int32_t this_ref, other_ref, barcode = 0;
+ int8_t orientation, left_read;
uint8_t *data;
- char *cig;
+ char *cig, *bar;
+ long incoming_warnings = *warnings;
this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
other_ref = bam->core.mtid + 1;
}
// work out orientations
- if (this_ref != other_ref) {
- leftmost = this_ref < other_ref;
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- leftmost = this_coord <= other_coord;
- } else {
- leftmost = this_end <= other_end;
- }
+ if (param->mode == MD_MODE_TEMPLATE) {
+
+ if (this_ref != other_ref) {
+ leftmost = this_ref < other_ref;
} else {
- if (bam_is_rev(bam)) {
- leftmost = this_end <= other_coord;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ if (!bam_is_rev(bam)) {
+ leftmost = this_coord <= other_coord;
+ } else {
+ leftmost = this_end <= other_end;
+ }
} else {
- leftmost = this_coord <= other_end;
+ if (bam_is_rev(bam)) {
+ leftmost = this_end <= other_coord;
+ } else {
+ leftmost = this_coord <= other_end;
+ }
}
}
- }
- // pair orientation
- if (leftmost) {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- other_coord = other_end;
+ // pair orientation
+ if (leftmost) {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ other_coord = other_end;
- if (!bam_is_rev(bam)) {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_FF;
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
} else {
- orientation = O_RR;
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
}
} else {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_RR;
+ if (!bam_is_rev(bam)) {
+ orientation = O_FR;
+ other_coord = other_end;
} else {
- orientation = O_FF;
+ orientation = O_RF;
+ this_coord = this_end;
}
}
} else {
- if (!bam_is_rev(bam)) {
- orientation = O_FR;
- other_coord = other_end;
- } else {
- orientation = O_RF;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
this_coord = this_end;
- }
- }
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- this_coord = this_end;
- if (!bam_is_rev(bam)) {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_RR;
+ if (!bam_is_rev(bam)) {
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
} else {
- orientation = O_FF;
+ if (bam->core.flag & BAM_FREAD1) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
}
} else {
- if (bam->core.flag & BAM_FREAD1) {
- orientation = O_FF;
+ if (!bam_is_rev(bam)) {
+ orientation = O_RF;
+ other_coord = other_end;
} else {
- orientation = O_RR;
+ orientation = O_FR;
+ this_coord = this_end;
}
}
- } else {
- if (!bam_is_rev(bam)) {
- orientation = O_RF;
- other_coord = other_end;
- } else {
- orientation = O_FR;
- this_coord = this_end;
- }
}
- }
-
- if (!leftmost)
- leftmost = R_RI;
- else
- leftmost = R_LE;
+ } else { // MD_MODE_SEQUENCE
- key->single = 0;
- key->this_ref = this_ref;
- key->this_coord = this_coord;
- key->other_ref = other_ref;
- key->other_coord = other_coord;
- key->leftmost = leftmost;
- key->orientation = orientation;
-
- return 0;
-}
-
-
-static int make_pair_key_sequence(key_data_t *key, bam1_t *bam) {
- hts_pos_t this_coord, this_end, other_coord, other_end, leftmost;
- int32_t this_ref, other_ref;
- int8_t orientation, left_read;
- uint8_t *data;
- char *cig;
-
- this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
- other_ref = bam->core.mtid + 1;
-
- this_coord = unclipped_start(bam);
- this_end = unclipped_end(bam);
-
- if ((data = bam_aux_get(bam, "MC"))) {
- if (!(cig = bam_aux2Z(data))) {
- fprintf(samtools_stderr, "[markdup] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
- return 1;
- }
-
- other_end = unclipped_other_end(bam->core.mpos, cig);
- other_coord = unclipped_other_start(bam->core.mpos, cig);
- } else {
- fprintf(samtools_stderr, "[markdup] error: no MC tag. Please run samtools fixmate on file first.\n");
- return 1;
- }
-
- // work out orientations
- if (this_ref != other_ref) {
- leftmost = this_ref - other_ref;
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- leftmost = this_coord - other_coord;
- } else {
- leftmost = this_end - other_end;
- }
+ if (this_ref != other_ref) {
+ leftmost = this_ref - other_ref;
} else {
- if (bam_is_rev(bam)) {
- leftmost = this_end - other_coord;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ if (!bam_is_rev(bam)) {
+ leftmost = this_coord - other_coord;
+ } else {
+ leftmost = this_end - other_end;
+ }
} else {
- leftmost = this_coord - other_end;
+ if (bam_is_rev(bam)) {
+ leftmost = this_end - other_coord;
+ } else {
+ leftmost = this_coord - other_end;
+ }
}
}
- }
- if (leftmost < 0) {
- leftmost = 1;
- } else if (leftmost > 0) {
- leftmost = 0;
- } else {
- // tie breaks
+ if (leftmost < 0) {
+ leftmost = 1;
+ } else if (leftmost > 0) {
+ leftmost = 0;
+ } else {
+ // tie breaks
- if (bam->core.pos == bam->core.mpos) {
- if (bam->core.flag & BAM_FREAD1) {
+ if (bam->core.pos == bam->core.mpos) {
+ if (bam->core.flag & BAM_FREAD1) {
+ leftmost = 1;
+ } else {
+ leftmost = 0;
+ }
+ } else if (bam->core.pos < bam->core.mpos) {
leftmost = 1;
} else {
leftmost = 0;
}
- } else if (bam->core.pos < bam->core.mpos) {
- leftmost = 1;
- } else {
- leftmost = 0;
}
- }
- // pair orientation
- if (leftmost) {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+ // pair orientation
+ if (leftmost) {
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- orientation = O_FF;
+ if (!bam_is_rev(bam)) {
+ orientation = O_FF;
+ } else {
+ orientation = O_RR;
+ }
} else {
- orientation = O_RR;
+ if (!bam_is_rev(bam)) {
+ orientation = O_FR;
+ } else {
+ orientation = O_RF;
+ }
}
} else {
- if (!bam_is_rev(bam)) {
- orientation = O_FR;
+ if (bam_is_rev(bam) == bam_is_mrev(bam)) {
+
+ if (!bam_is_rev(bam)) {
+ orientation = O_RR;
+ } else {
+ orientation = O_FF;
+ }
} else {
- orientation = O_RF;
+ if (!bam_is_rev(bam)) {
+ orientation = O_RF;
+ } else {
+ orientation = O_FR;
+ }
}
}
- } else {
- if (bam_is_rev(bam) == bam_is_mrev(bam)) {
- if (!bam_is_rev(bam)) {
- orientation = O_RR;
- } else {
- orientation = O_FF;
- }
+ if (!bam_is_rev(bam)) {
+ this_coord = unclipped_start(bam);
} else {
- if (!bam_is_rev(bam)) {
- orientation = O_RF;
- } else {
- orientation = O_FR;
- }
+ this_coord = unclipped_end(bam);
+ }
+
+ if (!bam_is_mrev(bam)) {
+ other_coord = unclipped_other_start(bam->core.mpos, cig);
+ } else {
+ other_coord = unclipped_other_end(bam->core.mpos, cig);
}
}
else
left_read = R_LE;
- if (!bam_is_rev(bam)) {
- this_coord = unclipped_start(bam);
- } else {
- this_coord = unclipped_end(bam);
+ if (param->barcode) {
+ if ((data = bam_aux_get(bam, param->barcode))) {
+ if (!(bar = bam_aux2Z(data))) {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ }
+ } else {
+ barcode = do_hash((unsigned char *)bar, strlen(bar));
+ }
+ }
+ } else if (param->bc_rgx) {
+ int result;
+ regmatch_t matches[3];
+ size_t max_matches = 2;
+ char *qname = bam_get_qname(bam);
+
+ if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) {
+ int bc_start, bc_end;
+
+ bc_start = matches[1].rm_so;
+ bc_end = matches[1].rm_eo;
+
+ if (bc_start != -1) {
+ barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start);
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ }
+ }
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ char warn_msg[256];
+
+ regerror(result, param->bc_rgx, warn_msg, 256);
+ fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ }
+ }
}
- if (!bam_is_mrev(bam)) {
- other_coord = unclipped_other_start(bam->core.mpos, cig);
- } else {
- other_coord = unclipped_other_end(bam->core.mpos, cig);
+ if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
+ fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ *warnings);
}
key->single = 0;
key->other_coord = other_coord;
key->leftmost = left_read;
key->orientation = orientation;
+ key->barcode = barcode;
return 0;
}
+
/* Create a signature hash of single read (or read with an unmatched pair).
Uses unclipped start (or end depending on orientation), reference id,
and orientation. */
-static void make_single_key(key_data_t *key, bam1_t *bam) {
+static void make_single_key(md_param_t *param, key_data_t *key, bam1_t *bam, long *warnings) {
hts_pos_t this_coord;
- int32_t this_ref;
+ int32_t this_ref, barcode = 0;
int8_t orientation;
+ uint8_t *data;
+ char *bar;
+ long incoming_warnings = *warnings;
this_ref = bam->core.tid + 1; // avoid a 0 being put into the hash
orientation = O_FF;
}
+ if (param->barcode) {
+ if ((data = bam_aux_get(bam, param->barcode))) {
+ if (!(bar = bam_aux2Z(data))) {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(samtools_stderr, "[markdup] warning: %s tag wrong type. Aux tag needs to be a string type.\n", param->barcode);
+ }
+ } else {
+ barcode = do_hash((unsigned char *)bar, strlen(bar));
+ }
+ }
+ } else if (param->bc_rgx) {
+ int result;
+ regmatch_t matches[3];
+ size_t max_matches = 2;
+ char *qname = bam_get_qname(bam);
+
+ if ((result = regexec(param->bc_rgx, qname, max_matches, matches, 0)) == 0) {
+ int bc_start, bc_end;
+
+ bc_start = matches[1].rm_so;
+ bc_end = matches[1].rm_eo;
+
+ if (bc_start != -1) {
+ barcode = do_hash((unsigned char *)qname + bc_start, bc_end - bc_start);
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(samtools_stderr, "[markdup] warning: barcode regex unable to match substring on %s.\n", qname);
+ }
+ }
+ } else {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ char warn_msg[256];
+
+ regerror(result, param->bc_rgx, warn_msg, 256);
+ fprintf(samtools_stderr, "[markdup] warning: barcode regex match error \"%s\" on %s.\n", warn_msg, qname);
+ }
+ }
+ }
+
+ if ((*warnings == BMD_WARNING_MAX) && (incoming_warnings != *warnings)) {
+ fprintf(samtools_stderr, "[markdup] warning: %ld barcode read warnings. New warnings will not be reported.\n",
+ *warnings);
+ }
+
key->single = 1;
key->this_ref = this_ref;
key->this_coord = this_coord;
key->orientation = orientation;
+ key->barcode = barcode;
}
}
-/* Get the position of the coordinates from the read name. */
-static inline int get_coordinate_positions_colons(const char *qname, int *xpos, int *ypos) {
+/* Get coordinates from the standard Illumina style read names.
+ Returned values are of the x and y coordinates and a section of
+ the read name to test (t) for string equality e.g. lane and tile part.
+
+ qname is scanned once, counting ':' separators and remembering the
+ offsets at which the x and y fields start. Names with 3, 4, 6 or 7
+ colons are accepted.
+ On success x_coord and y_coord hold the parsed values, t_beg is 0,
+ t_end is the offset of the x field, and 0 is returned.
+ On failure (unexpected colon count, or no number at the x or y offset)
+ *warnings is incremented, a message is written to samtools_stderr
+ while the count is at most BMD_WARNING_MAX, and 1 is returned.
+ param is not read in this function. */
+
+static int get_coordinates_colons(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) {
int sep = 0;
int pos = 0;
+ int xpos = 0, ypos = 0; // offsets into qname where the x and y fields start
+ char *end;
while (qname[pos]) {
if (qname[pos] == ':') {
sep++;
if (sep == 2) {
- *xpos = pos + 1;
+ xpos = pos + 1; // x candidate: field after the 2nd colon
} else if (sep == 3) {
- *ypos = pos + 1;
+ ypos = pos + 1; // y candidate: field after the 3rd colon
} else if (sep == 4) { // HiSeq style names
- *xpos = *ypos;
- *ypos = pos + 1;
+ xpos = ypos; // a 4th colon shifts both fields one place to the right
+ ypos = pos + 1;
} else if (sep == 5) { // Newer Illumina format
- *xpos = pos + 1;
+ xpos = pos + 1; // x is the field after the 5th colon
} else if (sep == 6) {
- *ypos = pos + 1;
+ ypos = pos + 1; // y is the field after the 6th colon
}
}
pos++;
}
- return sep;
+ /* The most current Illumina read format at time of writing is:
+ @machine:run:flowcell:lane:tile:x:y:UMI or
+ @machine:run:flowcell:lane:tile:x:y
+
+ Counting the separating colons gives us a quick format check.
+ Older name formats have fewer elements.
+ */
+
+ if (!(sep == 3 || sep == 4 || sep == 6 || sep == 7)) {
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) { // every failure is counted, only the first BMD_WARNING_MAX are printed
+ fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname);
+ }
+
+ return 1;
+ } else {
+ *x_coord = strtol(qname + xpos, &end, 10);
+
+ if ((qname + xpos) == end) { // strtol consumed nothing: no number at the x offset
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s .\n", qname);
+ }
+
+ return 1;
+ }
+
+ *y_coord = strtol(qname + ypos, &end, 10);
+
+ if ((qname + ypos) == end) { // strtol consumed nothing: no number at the y offset
+ (*warnings)++;
+
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s .\n", qname);
+ }
+
+ return 1;
+ }
+
+ *t_beg = 0;
+ *t_end = xpos; // the section to test is everything before the x field
+ }
+
+ return 0;
}
-/* Get the position of the coordinates from the read name.
- Positions returned are of the x and y coordinate and an optional section of
+/* Get the coordinates from the read name.
+ The name is matched against the regex in param->rgx; param->rgx_x and
+ param->rgx_y index the capture groups that hold x and y.
+ Returns 0 on success.
+ Returns -1, without touching *warnings, when the regex does not match
+ or one of the needed groups is unset.
+ Returns 1, after incrementing *warnings, when a captured coordinate is
+ longer than 254 characters or does not start with a number.
+ Returned values are of the x and y coordinates and an optional section of
the read name to test (t) for string equality e.g. lane and tile part. */
-static inline int get_coordinate_positions_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, int *xpos, int *ypos) {
+
+static inline int get_coordinates_regex(md_param_t *param, const char *qname, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) {
regmatch_t matches[5];
size_t max_matches = 5;
+ int xpos, ypos, xend, yend, xlen, ylen; // start, end and length of the x and y captures within qname
+ char coord[255]; // scratch copy of one captured coordinate (at most 254 chars plus NUL)
+ char *end;
if (!param->rgx_t)
max_matches = 4;
if (regexec(param->rgx, qname, max_matches, matches, 0))
return -1;
- *xpos = matches[param->rgx_x].rm_so;
- *ypos = matches[param->rgx_y].rm_so;
+ xpos = matches[param->rgx_x].rm_so;
+ ypos = matches[param->rgx_y].rm_so;
if (param->rgx_t) {
*t_beg = matches[param->rgx_t].rm_so;
*t_beg = *t_end = 0;
}
- if (*xpos == -1 || *ypos == -1 || *t_beg == -1)
+ if (xpos == -1 || ypos == -1 || *t_beg == -1) // -1 offset: that group did not take part in the match
return -1;
- return 7; // 3, 4, 6 and 7 are successes in the previous function
-}
+ xend = matches[param->rgx_x].rm_eo;
+ yend = matches[param->rgx_y].rm_eo;
+ if ((xlen = xend - xpos) > 254) { // capture would not fit in coord[]
+ (*warnings)++;
-static int get_coordinate_positions(md_param_t *param, const char *qname, int *beg, int *end, int *xpos, int *ypos, long *warnings) {
- int ret = 0;
- int seps;
+ if (*warnings <= BMD_WARNING_MAX) { // every failure is counted, only the first BMD_WARNING_MAX are printed
+ fprintf(samtools_stderr, "[markdup] warning: x coordinate string longer than allowed qname length in %s (%d long).\n", qname, xlen);
+ }
- if (param->rgx == NULL) {
- seps = get_coordinate_positions_colons(qname, xpos, ypos);
- *beg = 0;
- *end = *xpos;
- } else {
- seps = get_coordinate_positions_regex(param, qname, beg, end, xpos, ypos);
+ return 1;
}
- /* The most current Illumina read format at time of writing is:
- @machine:run:flowcell:lane:tile:x:y:UMI or
- @machine:run:flowcell:lane:tile:x:y
+ strncpy(coord, qname + xpos, xlen);
+ coord[xlen] = '\0'; // terminate the copied capture explicitly
+ *x_coord = strtol(coord, &end, 10);
- Counting the separating colons gives us a quick format check.
- Older name formats have fewer elements.
- */
-
- if (!(seps == 3 || seps == 4 || seps == 6 || seps == 7)) {
+ if (coord == end) { // strtol consumed nothing: the x capture is not a number
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: cannot decipher read name %s for optical duplicate marking.\n", qname);
+ fprintf(samtools_stderr, "[markdup] warning: cannot decipher x coordinate in %s (%s).\n", qname, coord);
}
- ret = 1;
+ return 1;
}
- return ret;
-}
-
+ if ((ylen = yend - ypos) > 254) { // capture would not fit in coord[]
+ (*warnings)++;
-static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) {
- int ret = 1;
- int xpos = 0, ypos = 0;
- long x = 0, y = 0;
- char *end;
+ if (*warnings <= BMD_WARNING_MAX) {
+ fprintf(samtools_stderr, "[markdup] warning: y coordinate string longer than allowed qname length in %s (%d long).\n", qname, ylen);
+ }
- if (get_coordinate_positions(param, name, t_beg, t_end, &xpos, &ypos, warnings)) {
- return ret;
+ return 1;
}
- x = strtol(name + xpos, &end, 10);
+ strncpy(coord, qname + ypos, ylen);
+ coord[ylen] = '\0'; // terminate the copied capture explicitly
+ *y_coord = strtol(coord, &end, 10);
- if ((name + xpos) == end) {
+ if (coord == end) { // strtol consumed nothing: the y capture is not a number
(*warnings)++;
if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", name);
+ fprintf(samtools_stderr, "[markdup] warning: cannot decipher y coordinate in %s (%s).\n", qname, coord);
}
- return ret;
+ return 1;
}
- y = strtol(name + ypos, &end, 10);
+ return 0;
+}
- if ((name + ypos) == end) {
- (*warnings)++;
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: can not decipher y coordinate in %s .\n", name);
- }
+static int get_coordinates(md_param_t *param, const char *name, int *t_beg, int *t_end, long *x_coord, long *y_coord, long *warnings) { // picks the read-name parser and returns its result unchanged
+ int ret = 1; // always overwritten by one of the two parsers below
- return ret;
+ if (param->rgx == NULL) { // no coordinate regex set: use the colon-delimited name parser
+ ret = get_coordinates_colons(param, name, t_beg, t_end, x_coord, y_coord, warnings);
+ } else { // a coordinate regex is set: use the regex parser
+ ret = get_coordinates_regex(param, name, t_beg, t_end, x_coord, y_coord, warnings);
}
- *x_coord = x;
- *y_coord = y;
- ret = 0;
-
return ret;
}
-/* Using the coordinates from the Illumina read name, see whether the duplicated read is
+/* Using the coordinates from the read name, see whether the duplicated read is
close enough (set by max_dist) to the original to be counted as optical.*/
-static int optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) {
+static int is_optical_duplicate(md_param_t *param, bam1_t *ori, bam1_t *dup, long max_dist, long *warnings) {
int ret = 0;
char *original, *duplicate;
- int oxpos = 0, oypos = 0, dxpos = 0, dypos = 0;
+ long ox, oy, dx, dy;
int o_beg = 0, o_end = 0, d_beg = 0, d_end = 0;
-
original = bam_get_qname(ori);
duplicate = bam_get_qname(dup);
- if (get_coordinate_positions(param, original, &o_beg, &o_end, &oxpos, &oypos, warnings)) {
+ if (get_coordinates(param, original, &o_beg, &o_end, &ox, &oy, warnings)) {
return ret;
}
- if (get_coordinate_positions(param, duplicate, &d_beg, &d_end, &dxpos, &dypos, warnings)) {
+ if (get_coordinates(param, duplicate, &d_beg, &d_end, &dx, &dy, warnings)) {
return ret;
}
if (strncmp(original + o_beg, duplicate + d_beg, o_end - o_beg) == 0) {
- // the initial parts match, look at the numbers
- long ox, oy, dx, dy, xdiff, ydiff;
- char *end;
-
- ox = strtol(original + oxpos, &end, 10);
-
- if ((original + oxpos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s .\n", original);
- }
-
- return ret;
- }
-
- dx = strtol(duplicate + dxpos, &end, 10);
-
- if ((duplicate + dxpos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: can not decipher X coordinate in %s.\n", duplicate);
- }
-
- return ret;
- }
+ long xdiff, ydiff;
if (ox > dx) {
xdiff = ox - dx;
if (xdiff <= max_dist) {
// still might be optical
- oy = strtol(original + oypos, &end, 10);
-
- if ((original + oypos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", original);
- }
-
- return ret;
- }
-
- dy = strtol(duplicate + dypos, &end, 10);
-
- if ((duplicate + dypos) == end) {
- (*warnings)++;
-
- if (*warnings <= BMD_WARNING_MAX) {
- fprintf(samtools_stderr, "[markdup] warning: can not decipher Y coordinate in %s.\n", duplicate);
- }
-
- return ret;
- }
-
if (oy > dy) {
ydiff = oy - dy;
} else {
}
if (param->opt_dist) { // mark optical duplicates
- if (optical_duplicate(param, ori, dup, param->opt_dist, warn)) {
+ if (is_optical_duplicate(param, ori, dup, param->opt_dist, warn)) {
bam_aux_update_str(dup, "dt", 3, "SQ");
dup_type = 'O';
(*optical)++;
int ret;
long reading, writing, excluded, duplicate, single, pair, single_dup, examined, optical, single_optical;
long np_duplicate, np_opt_duplicate;
- long opt_warnings = 0;
+ long opt_warnings = 0, bc_warnings = 0;
tmp_file_t temp;
char *idx_fn = NULL;
int exclude = 0;
key_data_t single_key;
in_hash_t *bp;
- if (param->mode) {
- if (make_pair_key_sequence(&pair_key, in_read->b)) {
- fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n");
- goto fail;
- }
- } else {
- if (make_pair_key_template(&pair_key, in_read->b)) {
- fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n");
- goto fail;
- }
+ if (make_pair_key(param, &pair_key, in_read->b, &bc_warnings)) {
+ fprintf(samtools_stderr, "[markdup] error: unable to assign pair hash key.\n");
+ goto fail;
}
- make_single_key(&single_key, in_read->b);
+ make_single_key(param, &single_key, in_read->b, &bc_warnings);
pair++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
key_data_t single_key;
in_hash_t *bp;
- make_single_key(&single_key, in_read->b);
+ make_single_key(param, &single_key, in_read->b, &bc_warnings);
single++;
in_read->pos = single_key.this_coord; // cigar/orientation modified pos
opt_warnings);
}
+ if (bc_warnings) {
+ fprintf(samtools_stderr, "[markdup] warning: number of failed attempts to get barcodes = %ld\n", bc_warnings);
+ }
+
if (param->do_stats) {
FILE *fp;
int file_open = 0;
fprintf(samtools_stderr, " --read-coords STR Regex for coords from read name.\n");
fprintf(samtools_stderr, " --coords-order STR Order of regex elements. txy (default). With t being a part of\n"
" the read names that must be equal and x/y being coordinates.\n");
+ fprintf(samtools_stderr, " --barcode-tag STR Use barcode a tag that duplicates much match.\n");
+ fprintf(samtools_stderr, " --barcode-name Use the UMI/barcode in the read name (eigth colon delimited part).\n");
+ fprintf(samtools_stderr, " --barcode-rgx STR Regex for barcode in the readname (alternative to --barcode-name).\n");
fprintf(samtools_stderr, " -t Mark primary duplicates with the name of the original in a \'do\' tag."
" Mainly for information and debugging.\n");
int bam_markdup(int argc, char **argv) {
- int c, ret;
+ int c, ret, bc_name = 0;
char wmode[4] = {'w', 'b', 0, 0};
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
htsThreadPool p = {NULL, 0};
kstring_t tmpprefix = {0, 0, NULL};
struct stat st;
unsigned int t;
- char *regex = NULL;
+ char *regex = NULL, *bc_regex = NULL;
char *regex_order = "txy";
- md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, NULL, NULL, NULL, NULL, 0, 0, 0};
+ md_param_t param = {NULL, NULL, NULL, 0, 300, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, NULL, NULL, NULL, NULL, 0, 0, 0, NULL, NULL};
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{"no-multi-dup", no_argument, NULL, 1003},
{"read-coords", required_argument, NULL, 1004},
{"coords-order", required_argument, NULL, 1005},
+ {"barcode-tag", required_argument, NULL, 1006},
+ {"barcode-name", no_argument, NULL, 1007},
+ {"barcode-rgx", required_argument, NULL, 1008},
{NULL, 0, NULL, 0}
};
case 1003: param.check_chain = 0; break;
case 1004: regex = optarg; break;
case 1005: regex_order = optarg; break;
+ case 1006: param.barcode = optarg; break;
+ case 1007: bc_name = 1; break;
+ case 1008: bc_name = 1, bc_regex = optarg; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': return markdup_usage();
if (optind + 2 > argc)
return markdup_usage();
+ if (param.barcode && bc_name) {
+ fprintf(samtools_stderr, "[markdup] Error: cannot specify --barcode-tag and "
+ "--barcode-name (or --barcode-rgx) at same time.\n");
+ return 1;
+ }
+
if (param.opt_dist < 0) param.opt_dist = 0;
if (param.max_length < 0) param.max_length = 300;
param.rgx_y = 2;
param.rgx_t = 0;
} else {
- fprintf(samtools_stderr, "[markdup] error: could not recognise regex coorindate order \"%s\".\n", regex_order);
+ fprintf(samtools_stderr, "[markdup] error: could not recognise regex coordinate order \"%s\".\n", regex_order);
return 1;
}
}
}
+ if (bc_name) {
+ int result;
+
+ /* From Illumina UMI documentation: "The UMI sequence is located in the
+ eighth colon-delimited field of the read name (QNAME)". */
+ char *rgx = "[0-9A-Za-z]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:[0-9]+:([!-?A-~]+)";
+
+ if ((param.bc_rgx = malloc(sizeof(regex_t))) == NULL) {
+ fprintf(samtools_stderr, "[markdup] error: could not allocate memory for barcode regex.\n");
+ return 1;
+ }
+
+ if (bc_regex) {
+ rgx = bc_regex;
+ }
+
+ if ((result = regcomp(param.bc_rgx, rgx, REG_EXTENDED))) {
+ char err_msg[256];
+
+ regerror(result, param.bc_rgx, err_msg, 256);
+ fprintf(samtools_stderr, "[markdup] error: barcode regex error \"%s\"\n", err_msg);
+ free(param.bc_rgx);
+ return 1;
+ }
+ }
+
param.in = sam_open_format(argv[optind], "r", &ga.in);
if (!param.in) {
free(param.rgx);
}
+ if (param.bc_rgx) {
+ regfree(param.bc_rgx);
+ free(param.bc_rgx);
+ }
+
free(param.arg_list);
free(tmpprefix.s);
sam_global_args_free(&ga);
" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
-" -x, --ignore-overlaps disable read-pair overlap detection\n"
+" -x, --ignore-overlaps-removal, --disable-overlap-removal\n"
+" disable read-pair overlap detection and removal\n"
" -X, --customized-index use customized index files\n" // -X flag for index filename
"\n"
"Output options:\n"
{"min-mq", required_argument, NULL, 'q'},
{"min-BQ", required_argument, NULL, 'Q'},
{"min-bq", required_argument, NULL, 'Q'},
- {"ignore-overlaps", no_argument, NULL, 'x'},
+ // NB: old "--ignore-overlaps" auto-completes to this
+ {"ignore-overlaps-removal", no_argument, NULL, 'x'},
+ {"disable-overlap-removal", no_argument, NULL, 'x'},
{"output-mods", no_argument, NULL, 'M'},
{"output-BP", no_argument, NULL, 'O'},
{"output-bp", no_argument, NULL, 'O'},
" --ff, --excl-flags STR|INT filter flags: skip reads with any of the mask bits set\n"
" [%s]\n", tmp_filter);
fprintf(fp,
-" -x, --ignore-overlaps disable read-pair overlap detection\n"
+" -x, --ignore-overlaps-removal, --disable-overlap-removal\n"
+" disable read-pair overlap detection and removal\n"
" -X, --customized-index use customized index files\n" // -X flag for index filename
"\n"
"Output options:\n"
{"min-mq", required_argument, NULL, 'q'},
{"min-BQ", required_argument, NULL, 'Q'},
{"min-bq", required_argument, NULL, 'Q'},
- {"ignore-overlaps", no_argument, NULL, 'x'},
+ // NB: old "--ignore-overlaps" auto-completes to this
+ {"ignore-overlaps-removal", no_argument, NULL, 'x'},
+ {"disable-overlap-removal", no_argument, NULL, 'x'},
{"output-mods", no_argument, NULL, 'M'},
{"output-BP", no_argument, NULL, 'O'},
{"output-bp", no_argument, NULL, 'O'},
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2021 Genome Research Ltd.
+ Copyright (C) 2008-2022 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "sam_opts.h"
#include "samtools.h"
#include "bedidx.h"
+#include "bam.h"
+
+
+/* Sorting key for the TemplateCoordinate sort order.
+ * Filled in by template_coordinate_key(), which puts the end of the pair that
+ * orders first (by tid, then position) into the "1" fields and the other end
+ * into the "2" fields. */
+typedef struct {
+ int tid1; // reference id of the first-ordered end (INT32_MAX default when an end is not mapped)
+ int tid2; // reference id of the other end (same default)
+ hts_pos_t pos1; // unclipped start/end (chosen by strand) of the first-ordered end; HTS_POS_MAX default
+ hts_pos_t pos2; // same, for the other end
+ bool neg1; // true if the first-ordered end is on the reverse strand
+ bool neg2; // true if the other end is on the reverse strand
+ const char *library; // value found in the library lookup for the read's RG tag, or ""
+ char *mid; // string value of the MI tag, or "" when the tag is absent
+ char *name; // read name as returned by bam_get_qname()
+ bool is_upper_of_pair; // true when the two ends were swapped while building the key
+} template_coordinate_key_t;
+
+// Store of template coordinate keys, kept in equally sized buffers (key idx is buffers[idx / buffer_size][idx % buffer_size])
+typedef struct {
+ size_t n; // the # of keys stored
+ size_t m; // the # of buffers allocated
+ size_t buffer_size; // the fixed # of keys held by each buffer
+ template_coordinate_key_t **buffers; // the list of m buffers
+} template_coordinate_keys_t;
+
+/* Gets the idx'th key; does not check that idx lies within the allocated buffers.
+ * Keys live in fixed-size buffers, so key idx is element (idx % buffer_size)
+ * of buffer (idx / buffer_size). Returns a pointer into that buffer. */
+static template_coordinate_key_t* template_coordinate_keys_get(template_coordinate_keys_t *keys, size_t idx) {
+ size_t buffer_idx = idx / keys->buffer_size; // which buffer holds the key
+ size_t buffer_offset = idx % keys->buffer_size; // position of the key inside that buffer
+ //assert(buffer_idx < keys->m);
+ //assert(buffer_offset < keys->buffer_size);
+ return &keys->buffers[buffer_idx][buffer_offset];
+}
+
+// Reallocates the key store so that it can hold more entries.
+// Each call adds 0x100 buffers of keys->buffer_size keys; max_k is the number
+// of entries the caller needs (only referenced by the disabled assert below).
+// Returns 0 on success and -1 if memory could not be allocated.  On failure
+// keys->buffers is still valid and keys->m counts only the buffers that were
+// really allocated, so the caller's cleanup loop can free them safely.
+static int template_coordinate_keys_realloc(template_coordinate_keys_t *keys, int max_k) {
+    size_t cur_m = keys->m;
+    size_t new_m = cur_m + 0x100;
+    size_t j;
+    template_coordinate_key_t **buffers;
+    //assert(new_m > cur_m);
+    //assert(new_m * keys->buffer_size >= max_k);
+    // Go through a temporary: if realloc fails the old list must not be lost
+    if ((buffers = realloc(keys->buffers, new_m * sizeof(template_coordinate_key_t*))) == NULL) {
+        print_error("sort", "couldn't reallocate memory for template coordinate key buffers");
+        return -1;
+    }
+    keys->buffers = buffers;
+    // allocate space for new buffers
+    for (j = cur_m; j < new_m; ++j) {
+        if ((keys->buffers[j] = malloc(sizeof(template_coordinate_key_t) * keys->buffer_size)) == NULL) {
+            print_error("sort", "couldn't allocate memory for template coordinate key buffer");
+            return -1;
+        }
+        // Only count a buffer once it exists, so [0, m) never holds junk pointers
+        keys->m = j + 1;
+    }
+    return 0;
+}
// Struct which contains the a record, and the pointer to the sort tag (if any) or
// a combined ref / position / strand.
-// Used to speed up tag and position sorts.
+// Used to speed up sorts (coordinate, by-tag, and template-coordinate).
typedef struct bam1_tag {
bam1_t *bam_record;
union {
const uint8_t *tag;
uint8_t pos_tid[12];
+ template_coordinate_key_t *key;
} u;
} bam1_tag;
KHASH_INIT(c2c, char*, char*, 1, kh_str_hash_func, kh_str_hash_equal)
KHASH_INIT(cset, char*, char, 0, kh_str_hash_func, kh_str_hash_equal)
KHASH_MAP_INIT_STR(c2i, int)
+KHASH_MAP_INIT_STR(const_c2c, char *)
#define hdrln_free_char(p)
KLIST_INIT(hdrln, char*, hdrln_free_char)
-static int g_is_by_qname = 0;
-static int g_is_by_tag = 0;
-static int g_is_by_minhash = 0;
+static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup);
+
+typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder; // sort orders handled by the sort/merge comparators
+static SamOrder g_sam_order = Coordinate; // order currently in effect; the comparators switch on this
static char g_sort_tag[2] = {0,0};
static int strnum_cmp(const char *_a, const char *_b)
static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b);
+static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b);
+static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header);
+static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup);
// Function to compare reads in the heap and determine which one is < the other
// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this
if (!b.entry.bam_record)
return 0;
- if (g_is_by_tag) {
- int t;
- t = bam1_cmp_by_tag(a.entry, b.entry);
- if (t != 0) return t > 0;
- } else if (g_is_by_minhash) {
- int t = bam1_cmp_by_minhash(a.entry, b.entry);
- if (t != 0) return t > 0;
- } else if (g_is_by_qname) {
- int t, fa, fb;
- t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
- if (t != 0) return t > 0;
- fa = a.entry.bam_record->core.flag & 0xc0;
- fb = b.entry.bam_record->core.flag & 0xc0;
- if (fa != fb) return fa > fb;
- } else {
- if (a.tid != b.tid) return a.tid > b.tid;
- if (a.pos != b.pos) return a.pos > b.pos;
- if (a.rev != b.rev) return a.rev > b.rev;
+ int t, fa, fb;
+ switch (g_sam_order) {
+ case Coordinate:
+ if (a.tid != b.tid) return a.tid > b.tid;
+ if (a.pos != b.pos) return a.pos > b.pos;
+ if (a.rev != b.rev) return a.rev > b.rev;
+ break;
+ case QueryName:
+ t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
+ if (t != 0) return t > 0;
+ fa = a.entry.bam_record->core.flag & 0xc0;
+ fb = b.entry.bam_record->core.flag & 0xc0;
+ if (fa != fb) return fa > fb;
+ break;
+ case TagQueryName:
+ case TagCoordinate:
+ t = bam1_cmp_by_tag(a.entry, b.entry);
+ if (t != 0) return t > 0;
+ break;
+ case MinHash:
+ t = bam1_cmp_by_minhash(a.entry, b.entry);
+ if (t != 0) return t > 0;
+ break;
+ case TemplateCoordinate:
+ t = bam1_cmp_template_coordinate(a.entry, b.entry);
+ if (t != 0) return t > 0;
+ break;
+ default:
+ print_error("heap_lt", "unknown sort order: %d", g_sam_order);
+ break;
}
+
// This compares by position in the input file(s)
if (a.i != b.i) return a.i > b.i;
return a.idx > b.idx;
}
// If there are no RG lines in the file and we are overriding add one
- if (is_rg && override && kl_begin(hdr_lines) == NULL) {
+ if (is_rg && override && hdr_lines->size == 0) {
kstring_t new_id = {0, 0, NULL};
kstring_t line = {0, 0, NULL};
kstring_t empty = {0, 0, NULL};
/*!
@abstract Merge multiple sorted BAM.
- @param by_qname whether to sort by query name
- @param sort_tag if non-null, sort by the given tag
+ @param sam_order the order in which the data was sorted
+ @param sort_tag if non-null, the tag that data was sorted by
@param out output BAM file name
@param mode sam_open() mode to be used to create the final output file
(overrides level settings from UNCOMP and LEVEL1 flags)
@discussion Padding information may NOT correctly maintained. This
function is NOT thread safe.
*/
-int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
+int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const char *mode,
const char *headers, int n, char * const *fn, char * const *fn_idx,
const char *fn_bed, int flag, const char *reg, int n_threads,
const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt,
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
refs_t *refs = NULL;
+ template_coordinate_keys_t *keys = NULL;
+ khash_t(const_c2c) *lib_lookup = NULL;
// Is there a specified pre-prepared header to use for output?
if (headers) {
}
}
- g_is_by_qname = by_qname;
- if (sort_tag) {
- g_is_by_tag = 1;
+ g_sam_order = sam_order;
+ if (sam_order == TagQueryName || sam_order == TagCoordinate) {
g_sort_tag[0] = sort_tag[0];
g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0';
}
hdr[i] = hin;
int order_ok = 1;
- if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
+ if ((translation_tbl+i)->lost_coord_sort && (sam_order == Coordinate || sam_order == MinHash)) {
fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
order_ok = 0;
}
rtrans = NULL;
}
+ // Make sure that there's enough memory for template coordinate keys, one per file to read
+ if (sam_order == TemplateCoordinate) {
+ if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) {
+ print_error("sort", "could not allocate memory for the top-level keys");
+ goto mem_fail;
+ }
+ keys->n = 0;
+ keys->m = 0;
+ keys->buffer_size = 0x10000;
+ keys->buffers = NULL;
+ // Make sure that there's enough memory for template coordinate keys, one per file to read
+ if (keys->n + n >= keys->m * keys->buffer_size) {
+ if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail;
+ }
+ lib_lookup = lookup_libraries(hout);
+ if (!lib_lookup) {
+ goto mem_fail;
+ }
+ }
+
// Load the first read from each file into the heap
for (i = 0; i < n; ++i) {
heap1_t *h = heap + i;
h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1);
h->rev = bam_is_rev(h->entry.bam_record);
h->idx = idx++;
- if (g_is_by_tag) {
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
h->entry.u.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
+ } else if (g_sam_order == TemplateCoordinate) {
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, i); // get the next key to use
+ h->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
+ if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
} else {
h->entry.u.tag = NULL;
}
bam_destroy1(h->entry.bam_record);
h->entry.bam_record = NULL;
h->entry.u.tag = NULL;
+ h->entry.u.key = NULL;
} else {
print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
heap->pos = (uint64_t)(b->core.pos + 1);
heap->rev = bam_is_rev(b);
heap->idx = idx++;
- if (g_is_by_tag) {
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
+ } else if (g_sam_order == TemplateCoordinate) {
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, heap->i); // get the next key to use
+ heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
+ if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
} else {
heap->entry.u.tag = NULL;
}
free(fp);
free(rtrans);
free(out_idx_fn);
+ if (keys != NULL) {
+ for (i = 0; i < keys->m; ++i) {
+ free(keys->buffers[i]);
+ }
+ free(keys->buffers);
+ free(keys);
+ }
+ lib_lookup_destroy(lib_lookup);
return -1;
}
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
+ SamOrder sam_order = by_qname ? QueryName : Coordinate;
+ return bam_merge_core2(sam_order, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
}
static void merge_usage(FILE *to)
" -b FILE List of input BAM filenames, one per line [null]\n"
" -X Use customized index files\n"
" -L FILE Specify a BED file for multiple region filtering [null]\n"
-" --no-PG do not add a PG line\n");
+" --no-PG do not add a PG line\n"
+" --template-coordinate Input files are sorted by template-coordinate\n");
sam_global_opt_help(to, "-.O..@..");
}
int bam_merge(int argc, char *argv[])
{
- int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0;
+ int c, flag = 0, ret = 0, level = -1, has_index_file = 0;
char *fn_headers = NULL, *reg = NULL, mode[12];
char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL;
long random_seed = (long)time(NULL);
char** fn = NULL;
char** fn_idx = NULL, *fn_bed = NULL;
int fn_size = 0, no_pg = 0;
+ SamOrder sam_order = Coordinate;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{"no-PG", no_argument, NULL, 1},
+ { "template-coordinate", no_argument, NULL, 2},
{ NULL, 0, NULL, 0 }
};
case 'r': flag |= MERGE_RG; break;
case 'f': flag |= MERGE_FORCE; break;
case 'h': fn_headers = optarg; break;
- case 'n': is_by_qname = 1; break;
+ case 'n': sam_order = QueryName; break;
case 'o': fnout = optarg; break;
case 't': sort_tag = optarg; break;
case '1': flag |= MERGE_LEVEL1; level = 1; break;
break;
}
case 1: no_pg = 1; break;
+ case 2: sam_order = TemplateCoordinate; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': merge_usage(stderr); return 1;
}
}
+ if (sort_tag != NULL) {
+ sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
+ }
+
if (fnout == NULL && argc - optind >= 1) {
fnout = argv[optind];
optind++;
strcpy(mode, "wb");
sam_open_mode(mode+1, fnout, NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
- if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers,
+ if (bam_merge_core2(sam_order, sort_tag, fnout, mode, fn_headers,
fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads,
"merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0)
ret = 1;
* BAM sorting *
***************/
+
typedef struct {
size_t from;
size_t to;
static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp,
int num_in_mem, buf_region *in_mem,
- bam1_tag *buf, uint64_t *idx, sam_hdr_t *hout) {
+ bam1_tag *buf, template_coordinate_keys_t *keys,
+ uint64_t *idx, sam_hdr_t *hout,
+ khash_t(const_c2c) *lib_lookup) {
int i = heap->i, res;
if (i < nfiles) { // read from file
res = sam_read1(fp[i], hout, heap->entry.bam_record);
+ if (res >= 0 && g_sam_order == TemplateCoordinate) { // file read OK and TemplateCoordinate order
+ // It is assumed that there are nfiles more keys allocated than keys->n; see allocation in bam_merge_simple
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, keys->n + i); // get the next key to use
+ heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
+ if (heap->entry.u.key == NULL) res = -1; // key could not be created, error out
+ }
} else { // read from memory
if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) {
- heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record;
+ size_t from = in_mem[i - nfiles].from;
+ heap->entry.bam_record = buf[from].bam_record;
+ if (g_sam_order == TemplateCoordinate) heap->entry.u.key = buf[from].u.key;
+ in_mem[i - nfiles].from++;
res = 0;
} else {
res = -1;
heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1);
heap->rev = bam_is_rev(heap->entry.bam_record);
heap->idx = (*idx)++;
- if (g_is_by_tag) {
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
- } else {
+ } else if (g_sam_order != TemplateCoordinate) {
heap->entry.u.tag = NULL;
+ heap->entry.u.key = NULL;
}
} else if (res == -1) {
heap->pos = HEAP_EMPTY;
if (i < nfiles) bam_destroy1(heap->entry.bam_record);
heap->entry.bam_record = NULL;
heap->entry.u.tag = NULL;
+ heap->entry.u.key = NULL;
} else {
return -1;
}
return 0;
}
-static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
+static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
const char *mode, sam_hdr_t *hout,
int n, char * const *fn, int num_in_mem,
- buf_region *in_mem, bam1_tag *buf, int n_threads,
+ buf_region *in_mem, bam1_tag *buf,
+ template_coordinate_keys_t *keys,
+ khash_t(const_c2c) *lib_lookup, int n_threads,
const char *cmd, const htsFormat *in_fmt,
const htsFormat *out_fmt, char *arg_list, int no_pg,
int write_index) {
int i, heap_size = n + num_in_mem;
char *out_idx_fn = NULL;
- g_is_by_qname = by_qname;
- if (sort_tag) {
- g_is_by_tag = 1;
+ if (sam_order == TagQueryName || sam_order == TagCoordinate) {
g_sort_tag[0] = sort_tag[0];
g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0';
}
heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t));
if (!heap) goto mem_fail;
+ // Make sure that there's enough memory for template coordinate keys, one per file to read
+ if (keys && keys->n + n >= keys->m * keys->buffer_size) {
+ if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail;
+ }
+
// Open each file, read the header and put the first read into the heap
for (i = 0; i < heap_size; i++) {
sam_hdr_t *hin;
// Get a read into the heap
h->i = i;
h->entry.u.tag = NULL;
+ h->entry.u.key = NULL;
if (i < n) {
h->entry.bam_record = bam_init1();
if (!h->entry.bam_record) goto mem_fail;
}
- if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, keys, &idx, hout,
+ lib_lookup) < 0) {
assert(i < n);
print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
ks_heapmake(heap, heap_size, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->entry.bam_record;
- if (g_is_by_minhash && b->core.tid == -1) {
+ if (g_sam_order == MinHash && b->core.tid == -1) {
// Remove the cached minhash value
b->core.pos = -1;
b->core.mpos = -1;
print_error_errno(cmd, "failed writing to \"%s\"", out);
goto fail;
}
- if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, keys, &idx,
+ hout, lib_lookup) < 0) {
assert(heap->i < n);
print_error(cmd, "Error reading \"%s\" : %s",
fn[heap->i], strerror(errno));
static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b)
{
uint64_t pa, pb;
- if (!a.bam_record)
- return 1;
- if (!b.bam_record)
- return 0;
+ if (!a.bam_record) return 1;
+ if (!b.bam_record) return 0;
- if (g_is_by_qname) {
+ if (g_sam_order == QueryName || g_sam_order == TagQueryName) {
int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record));
if (t != 0) return t;
return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0);
if (!A) return 1;
if (!B) return 0;
- if (A->core.tid != -1 || B->core.tid != -1)
- return bam1_cmp_core(a,b);
+ if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b);
const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
return bam1_cmp_core(a,b);
}
+/* Compares two molecular identifiers, ignoring a trailing slash and the single
+ * character after it (e.g. "12/A" and "12/B" compare equal).
+ * Returns -1 if mid1 sorts before mid2, 1 if it sorts after, and 0 if the
+ * trimmed identifiers are identical. An identifier that is a proper prefix of
+ * the other sorts first. */
+static inline int template_coordinate_key_compare_mid(const char* mid1, const char* mid2) {
+ size_t i = 0;
+ size_t len1 = strlen(mid1);
+ size_t len2 = strlen(mid2);
+ size_t shortest;
+
+ // Snip off trailing slash followed by a single character, if present
+ if (len1 >= 2 && mid1[len1-2] == '/') len1 -= 2;
+ if (len2 >= 2 && mid2[len2-2] == '/') len2 -= 2;
+ shortest = len1 < len2 ? len1 : len2;
+
+ // find first mismatching character
+ while (i < shortest && mid1[i] == mid2[i]) i++;
+
+ // decide the result from where the scan stopped
+ if (i == len1 && i < len2) return -1; // mid1 shorter
+ if (i == len2 && i < len1) return 1; // mid2 shorter
+ if (i == len1 && i == len2) return 0; // all characters match
+ if (mid1[i] < mid2[i]) return -1; // mid1 earlier
+ else return 1; // mid1 later
+}
+
+
+/* Builds the key used to sort in TemplateCoordinate order.
+ *   b          - record to build the key from
+ *   key        - caller-provided storage that is filled in
+ *   hdr        - not referenced by this function
+ *   lib_lookup - hash queried with the record's RG value to obtain the library
+ * Returns NULL if the key could not be created (the mate is mapped but the MC
+ * tag is missing or of the wrong type, or the MI tag is not a string),
+ * otherwise the pointer to the provided key. */
+static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup) {
+ uint8_t *data;
+ char *rg;
+ khiter_t k;
+
+ // defaults: maximum tid/pos values, forward strand, empty molecular identifier
+ key->tid1 = key->tid2 = INT32_MAX;
+ key->pos1 = key->pos2 = HTS_POS_MAX;
+ key->neg1 = key->neg2 = false;
+ key->mid = "";
+
+ // update values
+ rg = (char *)bam_aux_get(b, "RG");
+ if (rg && rg[0] == 'Z' // only string-typed RG tags are looked up
+ &&(k = kh_get(const_c2c, lib_lookup, rg + 1)) < kh_end(lib_lookup)) { // rg + 1 skips the type byte
+ key->library = kh_value(lib_lookup, k);
+ } else {
+ key->library = ""; // no RG tag, or RG value not present in the lookup
+ }
+ key->name = bam_get_qname(b);
+ if (!(b->core.flag & BAM_FUNMAP)) { // read is mapped, update coordinates
+ key->tid1 = b->core.tid;
+ key->neg1 = bam_is_rev(b);
+ key->pos1 = (key->neg1) ? unclipped_end(b) : unclipped_start(b); // reverse strand uses the unclipped end
+ }
+ if (b->core.flag & BAM_FPAIRED && !(b->core.flag & BAM_FMUNMAP)) { // mate is mapped, update coordinates
+ char *cigar; // mate CIGAR string taken from the MC tag
+ if ((data = bam_aux_get(b, "MC"))) {
+ if (!(cigar = bam_aux2Z(data))) {
+ fprintf(stderr, "[bam_sort] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
+ return NULL;
+ }
+ } else {
+ fprintf(stderr, "[bam_sort] error: no MC tag. Please run samtools fixmate on file first.\n");
+ return NULL;
+ }
+ key->tid2 = b->core.mtid;
+ key->neg2 = bam_is_mrev(b);
+ key->pos2 = (key->neg2) ? unclipped_other_end(b->core.mpos, cigar) : unclipped_other_start(b->core.mpos, cigar);
+ }
+
+ if ((data = bam_aux_get(b, "MI"))) { // MI is optional; key->mid stays "" without it
+ if (!(key->mid=bam_aux2Z(data))) {
+ fprintf(stderr, "[bam_sort] error: MI tag wrong type (not a string).\n");
+ return NULL;
+ }
+ }
+
+ // Put the earlier end (by tid, then pos, then forward strand) in the "1" fields so that
+ // the tid/pos/neg part of the key is the same for both reads of a pair; is_upper_of_pair records a swap
+ if (key->tid1 < key->tid2
+ || (key->tid1 == key->tid2 && key->pos1 < key->pos2)
+ || (key->tid1 == key->tid2 && key->pos1 == key->pos2 && !key->neg1)) {
+ key->is_upper_of_pair = false;
+ } else {
+ key->is_upper_of_pair = true;
+ // swap the "1" and "2" fields
+ int tmp_tid;
+ hts_pos_t tmp_pos;
+ bool tmp_neg;
+ tmp_tid = key->tid1;
+ key->tid1 = key->tid2;
+ key->tid2 = tmp_tid;
+ tmp_pos = key->pos1;
+ key->pos1 = key->pos2;
+ key->pos2 = tmp_pos;
+ tmp_neg = key->neg1;
+ key->neg1 = key->neg2;
+ key->neg2 = tmp_neg;
+ }
+
+ return key;
+}
+
+/* Compares two reads for the template-coordinate sort order using the keys
+ * cached in a.u.key and b.u.key. Fields are compared in this order, stopping
+ * at the first difference:
+ *   1. tid1, then tid2 (reference ids of the two ends)
+ *   2. pos1, then pos2 (the earlier, then the higher unclipped 5' coordinate)
+ *   3. neg1, then neg2 (an end flagged as reverse strand sorts first)
+ *   4. library (from read group)
+ *   5. the molecular identifier (if present)
+ *   6. read name
+ *   7. is_upper_of_pair (a key that was not swapped sorts first)
+ * Returns -1, 0 or 1 if a is less than, equal to or greater than b,
+ * respectively. A missing record in a gives 1; otherwise a missing record in
+ * b gives 0. */
+static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b)
+{
+ if (!a.bam_record) return 1;
+ if (!b.bam_record) return 0;
+
+ const template_coordinate_key_t* key_a = a.u.key;
+ const template_coordinate_key_t* key_b = b.u.key;
+
+ int retval = 0;
+ if (0 == retval) retval = key_a->tid1 - key_b->tid1; // NOTE(review): plain int subtraction; presumably tids are never negative here so it cannot overflow - confirm
+ if (0 == retval) retval = key_a->tid2 - key_b->tid2;
+ if (0 == retval) retval = key_a->pos1 < key_b->pos1 ? -1 : (key_a->pos1 > key_b->pos1 ? 1 : 0);
+ if (0 == retval) retval = key_a->pos2 < key_b->pos2 ? -1 : (key_a->pos2 > key_b->pos2 ? 1 : 0);
+ if (0 == retval) retval = key_a->neg1 == key_b->neg1 ? 0 : (key_a->neg1 ? -1 : 1); // reverse strand first
+ if (0 == retval) retval = key_a->neg2 == key_b->neg2 ? 0 : (key_a->neg2 ? -1 : 1);
+ if (0 == retval) retval = strcmp(key_a->library, key_b->library);
+ if (0 == retval) retval = template_coordinate_key_compare_mid(key_a->mid, key_b->mid);
+ if (0 == retval) retval = strcmp(key_a->name, key_b->name);
+ if (0 == retval) retval = key_a->is_upper_of_pair == key_b->is_upper_of_pair ? 0 : (key_a->is_upper_of_pair ? 1 : -1);
+ return retval < 0 ? -1 : (retval > 0 ? 1 : 0); // normalise to -1 / 0 / 1
+}
+
+
// Function to compare reads and determine which one is < the other
-// Handle sort-by-pos, sort-by-name, or sort-by-tag
+// Handle sort-by-pos, sort-by-name, sort-by-tag, or sort-by-template-coordinate.
static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
{
- if (g_is_by_tag) {
- return bam1_cmp_by_tag(a, b) < 0;
- } else if (g_is_by_minhash) {
- return bam1_cmp_by_minhash(a, b) < 0;
- } else {
- return bam1_cmp_core(a,b) < 0;
+ switch (g_sam_order) {
+ case Coordinate:
+ case QueryName:
+ return bam1_cmp_core(a, b) < 0;
+ case TagQueryName:
+ case TagCoordinate:
+ return bam1_cmp_by_tag(a, b) < 0;
+ case MinHash:
+ return bam1_cmp_by_minhash(a, b) < 0;
+ case TemplateCoordinate:
+ return bam1_cmp_template_coordinate(a, b) < 0;
+ default:
+ return bam1_cmp_core(a,b) < 0;
}
}
int error;
int no_save;
int large_pos;
+ int minimiser_kmer;
} worker_t;
// Returns 0 for success
}
//--- End of candidates to punt to htslib
+
+// Prepares a worker's buffer for the minhash sort: for each record whose
+// tid is -1, computes its minimiser hash with k-mer size w->minimiser_kmer
+// and caches the result in the record's core fields.  Records with any
+// other tid are left untouched.
+static inline void worker_minhash(worker_t *w) {
+    int idx;
+    for (idx = 0; idx < w->buf_len; idx++) {
+        bam1_t *rec = w->buf[idx].bam_record;
+        if (rec->core.tid == -1) {
+            int hash_pos = 0, is_rev = 0;
+            uint64_t hash = minhash(rec, w->minimiser_kmer, &hash_pos, &is_rev);
+            if (is_rev)
+                reverse_complement(rec);
+
+            // Store 64-bit hash in unmapped pos and mpos fields.
+            // The position of hash is in isize, which we use for
+            // resolving ties when sorting by hash key.
+            // These are unused for completely unmapped data and
+            // will be reset during final output.
+            rec->core.pos = hash >> 31;
+            rec->core.mpos = hash & 0x7fffffff;
+            rec->core.isize = 65535 - hash_pos >= 0 ? 65535 - hash_pos : 0;
+        }
+    }
+}
+
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
w->error = 0;
w->tmpfile_name = NULL;
- if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
- if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
- w->error = errno;
- return NULL;
- }
- } else {
- if (g_is_by_minhash) {
- int i;
- for (i = 0; i < w->buf_len; i++) {
- bam1_t *b = w->buf[i].bam_record;
- if (b->core.tid != -1)
- continue;
-
- int pos = 0, rev = 0;
- uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev);
- if (rev)
- reverse_complement(b);
-
- // Store 64-bit hash in unmapped pos and mpos fields.
- // The position of hash is in isize, which we use for
- // resolving ties when sorting by hash key.
- // These are unused for completely unmapped data and
- // will be reset during final output.
- b->core.pos = mh>>31;
- b->core.mpos = mh&0x7fffffff;
- b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+ switch (g_sam_order) {
+ case Coordinate:
+ if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
+ w->error = errno;
+ return NULL;
}
- }
- ks_mergesort(sort, w->buf_len, w->buf, 0);
+ break;
+ case MinHash:
+ worker_minhash(w);
+ // no break, go to merge sort
+ default:
+ ks_mergesort(sort, w->buf_len, w->buf, 0);
}
if (w->no_save)
static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
const sam_hdr_t *h, int n_threads, buf_region *in_mem,
- int large_pos, char **fns, size_t fns_size)
+ int large_pos, int minimiser_kmer, char **fns, size_t fns_size)
{
int i;
size_t pos, rest;
w[i].index = n_files + i;
w[i].tmpfile_name = NULL;
w[i].large_pos = large_pos;
+ w[i].minimiser_kmer = minimiser_kmer;
if (in_mem) {
w[i].no_save = 1;
in_mem[i].from = pos;
return n_files + n_threads;
}
+// Releases an RG-to-LB lookup table: frees every stored value and then the
+// table itself.  The key strings are not freed here.  Passing NULL is a no-op.
+static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) {
+    if (!lib_lookup)
+        return;
+
+    khiter_t it = kh_begin(lib_lookup);
+    while (it < kh_end(lib_lookup)) {
+        // Only occupied slots hold a value that needs freeing
+        if (kh_exist(lib_lookup, it))
+            free(kh_value(lib_lookup, it));
+        it++;
+    }
+    kh_destroy(const_c2c, lib_lookup);
+}
+
+// Build an RG to LB lookup table, for the template coordinate sort.
+// Each RG header line with an LB tag adds one entry, keyed by the name
+// pointer returned by sam_hdr_line_name() (not copied), whose value is the
+// LB string released from the scratch kstring.  RG lines without an LB tag
+// are skipped, and a key that is already present keeps its existing value.
+// Returns a populated hash table (which may be empty) on success;
+// NULL on failure.  The values are freed by lib_lookup_destroy().
+static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header)
+{
+    khash_t(const_c2c) *table = kh_init(const_c2c);
+    if (!table)
+        return NULL;
+
+    kstring_t lb = KS_INITIALIZE;   // scratch buffer reused for each LB lookup
+    int rg_idx;
+
+    // Iterate through any RG lines and look for library information
+    int n_rg = sam_hdr_count_lines(header, "RG");
+    if (n_rg < 0)
+        goto fail;
+
+    for (rg_idx = 0; rg_idx < n_rg; rg_idx++) {
+        const char *rg_id = sam_hdr_line_name(header, "RG", rg_idx);
+        if (!rg_id)
+            goto fail;
+
+        int status = sam_hdr_find_tag_pos(header, "RG", rg_idx, "LB", &lb);
+        if (status < -1)            // Error
+            goto fail;
+        if (status < 0 || !lb.s)    // No LB tag
+            continue;
+
+        // Add to lookup table
+        khiter_t slot = kh_put(const_c2c, table, rg_id, &status);
+        if (status < 0)             // Error
+            goto fail;
+        if (status > 0)             // Inserted: the table now owns the string
+            kh_value(table, slot) = ks_release(&lb);
+    }
+
+    free(lb.s);
+    return table;
+
+ fail:
+    lib_lookup_destroy(table);
+    free(lb.s);
+    return NULL;
+}
/*!
- @abstract Sort an unsorted BAM file based on the chromosome order
- and the leftmost position of an alignment
+ @abstract Sort an unsorted BAM file based on the provided sort order
- @param is_by_qname whether to sort by query name
- @param sort_by_tag if non-null, sort by the given tag
+ @param sam_order the order in which the sort should occur
+ @param sort_tag the tag to use if sorting by Tag
+ @param minimiser_kmer the kmer size when sorting by MinHash
@param fn name of the file to be sorted
@param prefix prefix of the temporary files (prefix.NNNN.bam are written)
@param fnout name of the final output file to be written
and then merge them by calling bam_merge_simple(). This function is
NOT thread safe.
*/
-int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
+int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
+ const char *fn, const char *prefix,
const char *fnout, const char *modeout,
- size_t _max_mem, int by_minimiser, int n_threads,
+ size_t _max_mem, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
int ret = -1, res, i, nref, n_files = 0;
size_t max_k, k, max_mem, bam_mem_offset;
sam_hdr_t *header = NULL;
- samFile *fp;
+ samFile *fp = NULL;
bam1_tag *buf = NULL;
+ template_coordinate_keys_t *keys = NULL;
bam1_t *b = bam_init1();
uint8_t *bam_mem = NULL;
char **fns = NULL;
size_t fns_size = 0;
- const char *new_so;
+ const char *new_so = NULL;
+ const char *new_go = NULL;
+ const char *new_ss = NULL;
buf_region *in_mem = NULL;
+ khash_t(const_c2c) *lib_lookup = NULL;
int num_in_mem = 0;
int large_pos = 0;
}
if (n_threads < 2) n_threads = 1;
- g_is_by_qname = is_by_qname;
- g_is_by_minhash = by_minimiser;
- if (sort_by_tag) {
- g_is_by_tag = 1;
- g_sort_tag[0] = sort_by_tag[0];
- g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0';
+ g_sam_order = sam_order;
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
+ g_sort_tag[0] = sort_tag[0];
+ g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0';
+ }
+
+ if (sam_order == TemplateCoordinate) {
+ if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) {
+ print_error("sort", "could not allocate memory for the top-level keys");
+ goto err;
+ }
+ keys->n = 0;
+ keys->m = 0;
+ keys->buffer_size = 0x10000;
+ keys->buffers = NULL;
}
max_mem = _max_mem * n_threads;
}
}
- if (sort_by_tag != NULL)
- new_so = "unknown";
- else if (is_by_qname)
- new_so = "queryname";
- else
- new_so = "coordinate";
+ if (g_sam_order == TemplateCoordinate) {
+ lib_lookup = lookup_libraries(header);
+ if (!lib_lookup)
+ goto err;
+ }
- if (by_minimiser) {
- const char *new_ss = "coordinate:minhash";
+ switch (g_sam_order) {
+ case Coordinate:
+ new_so = "coordinate";
+ break;
+ case QueryName:
+ new_so = "queryname";
+ break;
+ case MinHash:
+ new_so = "coordinate";
+ new_ss = "coordinate:minhash";
+ break;
+ case TagQueryName:
+ case TagCoordinate:
+ new_so = "unknown";
+ break;
+ case TemplateCoordinate:
+ new_so = "unsorted";
+ new_go = "query";
+ new_ss = "unsorted:template-coordinate";
+ break;
+ default:
+ new_so = "unknown";
+ break;
+ }
+
+ if (new_ss == NULL && new_go == NULL) { // just SO
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+ goto err;
+ }
+ } else if (new_ss != NULL && new_go == NULL) { // update SO and SS, but not GO
if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss))
&& (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
"SO", new_so, "SS", new_ss, NULL))
new_so, new_ss);
goto err;
}
- } else {
- if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
- && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+ } else if (new_ss == NULL && new_go != NULL) { // update SO and GO, but not SS
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+ "SO", new_so, "GO", new_go, NULL))
) {
- print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+ print_error("sort", "failed to change sort order header to 'SO:%s GO:%s'\n",
+ new_so, new_go);
+ goto err;
+ }
+ } else { // update SO, GO, and SS
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go, "SS", new_ss))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+ "SO", new_so, "GO", new_go, "SS", new_ss, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s GO:%s SS:%s'\n",
+ new_so, new_go, new_ss);
goto err;
}
}
- if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
- print_error("sort", "failed to delete group order header\n");
- goto err;
+ if (new_go == NULL) {
+ if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
+ print_error("sort", "failed to delete group order in header\n");
+ goto err;
+ }
+ }
+ if (new_ss == NULL) {
+ if (-1 == sam_hdr_remove_tag_hd(header, "SS")) {
+ print_error("sort", "failed to delete sub sort in header\n");
+ goto err;
+ }
}
// No gain to using the thread pool here as the flow of this code
}
buf = new_buf;
}
+ if (sam_order == TemplateCoordinate && k >= keys->m * keys->buffer_size) {
+ if (template_coordinate_keys_realloc(keys, k + 1) == -1) {
+ goto err;
+ }
+ }
// Check if the BAM record will fit in the memory limit
if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) {
mem_full = 1;
}
- // Pull out the value of the position
- // or the pointer to the sort tag if applicable
- if (g_is_by_tag) {
- buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
- } else {
- buf[k].u.tag = NULL;
+ // Set the tag if sorting by tag, or the key for template coordinate sorting
+ switch (g_sam_order) {
+ case TagQueryName:
+ case TagCoordinate:
+ buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
+ break;
+ case TemplateCoordinate:
+ ++keys->n;
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, k);
+ buf[k].u.key = template_coordinate_key(buf[k].bam_record, key, header, lib_lookup);
+ if (buf[k].u.key == NULL) goto err;
+ break;
+ default:
+ buf[k].u.tag = NULL;
+ buf[k].u.key = NULL;
}
++k;
&fns_size, &fns, 0) < 0)
goto err;
int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- NULL, large_pos, fns, fns_size);
+ NULL, large_pos, minimiser_kmer, fns, fns_size);
if (new_n < 0) {
goto err;
} else {
n_files = new_n;
}
k = 0;
+ if (keys != NULL) keys->n = 0;
bam_mem_offset = 0;
}
}
in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
if (!in_mem) goto err;
num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- in_mem, large_pos, fns, fns_size);
+ in_mem, large_pos, minimiser_kmer, fns, fns_size);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
// write the final output
if (n_files == 0 && num_in_mem < 2) { // a single block
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt,
- g_is_by_minhash, arg_list, no_pg, write_index) != 0) {
+ minimiser_kmer, arg_list, no_pg, write_index) != 0) {
print_error_errno("sort", "failed to create \"%s\"", fnout);
goto err;
}
abort();
}
}
- if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header,
- n_files, fns, num_in_mem, in_mem, buf,
- n_threads, "sort", in_fmt, out_fmt, arg_list,
- no_pg, write_index) < 0) {
+ char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? sort_tag : NULL;
+ if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
+ n_files, fns, num_in_mem, in_mem, buf, keys,
+ lib_lookup, n_threads, "sort", in_fmt, out_fmt,
+ arg_list, no_pg, write_index) < 0) {
// Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
}
bam_destroy1(b);
free(buf);
+ if (keys != NULL) {
+ for (i = 0; i < keys->m; ++i) {
+ free(keys->buffers[i]);
+ }
+ free(keys->buffers);
+ free(keys);
+ }
free(bam_mem);
free(in_mem);
+ lib_lookup_destroy(lib_lookup);
sam_hdr_destroy(header);
if (fp) sam_close(fp);
return ret;
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
if (!fnout) return -1;
sprintf(fnout, "%s.bam", prefix);
- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0);
+ SamOrder sam_order = is_by_qname ? QueryName : Coordinate;
+ g_sam_order = sam_order;
+ ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
free(fnout);
return ret;
}
" -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
" -o FILE Write final output to FILE rather than standard output\n"
" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
-" --no-PG do not add a PG line\n");
+" --no-PG\n"
+" Do not add a PG line\n"
+" --template-coordinate\n"
+" Sort by template-coordinate\n");
sam_global_opt_help(fp, "-.O..@..");
}
int bam_sort(int argc, char *argv[])
{
size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
- int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0;
- int by_minimiser = 0, minimiser_kmer = 20;
+ int c, nargs, ret, o_seen = 0, level = -1, no_pg = 0;
+ SamOrder sam_order = Coordinate;
+ bool by_tag = false;
+ int minimiser_kmer = 20;
char* sort_tag = NULL, *arg_list = NULL;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{"no-PG", no_argument, NULL, 1},
+ { "template-coordinate", no_argument, NULL, 2},
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
switch (c) {
case 'o': fnout = optarg; o_seen = 1; break;
- case 'n': is_by_qname = 1; break;
- case 't': sort_tag = optarg; break;
+ case 'n': sam_order = QueryName; break;
+ case 't': by_tag = true; sort_tag = optarg; break;
case 'm': {
char *q;
max_mem = strtol(optarg, &q, 0);
case 'l': level = atoi(optarg); break;
case 'u': level = 0; break;
case 1: no_pg = 1; break;
- case 'M': by_minimiser = 1; break;
+ case 2: sam_order = TemplateCoordinate; break;
+ case 'M': sam_order = MinHash; break;
case 'K':
minimiser_kmer = atoi(optarg);
if (minimiser_kmer < 1)
}
}
+ // Change sort order if tag sorting is requested. Must update based on secondary index
+ if (by_tag) {
+ sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
+ }
+
nargs = argc - optind;
if (nargs == 0 && isatty(STDIN_FILENO)) {
sort_usage(stdout);
goto sort_end;
}
- if (ga.write_index && (is_by_qname || sort_tag)) {
+ if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) {
fprintf(stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n");
ga.write_index = 0;
}
ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
}
- ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem,
- by_minimiser * minimiser_kmer, ga.nthreads,
+ ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0,
+ (nargs > 0) ? argv[optind] : "-",
+ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out, arg_list, no_pg, ga.write_index);
if (ret >= 0)
ret = EXIT_SUCCESS;
/* bam_sort.c -- sorting and merging.
- Copyright (C) 2008-2021 Genome Research Ltd.
+ Copyright (C) 2008-2022 Genome Research Ltd.
Portions copyright (C) 2009-2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include "sam_opts.h"
#include "samtools.h"
#include "bedidx.h"
+#include "bam.h"
+
+
+// Struct which contains the sorting key for TemplateCoordinate sort; fields are listed with the role they play in bam1_cmp_template_coordinate().
+typedef struct {
+ int tid1; // tid of end 1 of the template; compared first
+ int tid2; // tid of end 2 of the template; compared second
+ hts_pos_t pos1; // position of end 1 (the comparator documents this as the earlier unclipped 5' coordinate)
+ hts_pos_t pos2; // position of end 2 (the comparator documents this as the higher unclipped 5' coordinate)
+ bool neg1; // flag for end 1; true sorts before false (presumably reverse strand - confirm where the key is built)
+ bool neg2; // flag for end 2; true sorts before false (presumably reverse strand - confirm where the key is built)
+ const char *library; // library name, compared with strcmp(); NOTE(review): looks like it is not owned by the key - confirm
+ char *mid; // molecular identifier, compared with template_coordinate_key_compare_mid()
+ char *name; // read name, compared with strcmp()
+ bool is_upper_of_pair; // set when the two ends were swapped while building the key; false sorts before true
+} template_coordinate_key_t;
+
+// Struct to store fixed-size buffers of template coordinate keys; key idx lives at buffers[idx / buffer_size][idx % buffer_size]
+typedef struct {
+ size_t n; // the # of keys stored
+ size_t m; // the # of buffers allocated
+ size_t buffer_size; // the fixed # of keys that each buffer holds
+ template_coordinate_key_t **buffers; // the list of buffers
+} template_coordinate_keys_t;
+
+// Returns a pointer to the idx'th key slot; performs no bounds checking.
+// Keys are stored in fixed-size buffers, so idx is split into the number of
+// the buffer holding the key and the key's position within that buffer.
+static template_coordinate_key_t* template_coordinate_keys_get(template_coordinate_keys_t *keys, size_t idx) {
+    const size_t per_buffer = keys->buffer_size;
+    //assert(idx / per_buffer < keys->m);
+    template_coordinate_key_t *buffer = keys->buffers[idx / per_buffer];
+    return buffer + idx % per_buffer;
+}
+
+// Reallocates the key store so that it can hold more entries.
+// Each call grows the store by a fixed step of 0x100 buffers, each holding
+// keys->buffer_size keys; max_k is the capacity the caller needs and is
+// expected to be covered by a single step.
+// Returns 0 on success, or -1 (after printing an error) if memory could not
+// be allocated.  On failure keys->buffers remains valid and keys->m counts
+// only the buffers that were actually allocated, so the usual cleanup loop
+// (free every buffers[i] for i < m, then free buffers) stays safe.
+static int template_coordinate_keys_realloc(template_coordinate_keys_t *keys, int max_k) {
+    const size_t new_m = keys->m + 0x100;
+    //assert(new_m > keys->m);
+    //assert(new_m * keys->buffer_size >= max_k);
+
+    // Grow the pointer array via a temporary so the old array is not lost
+    // (and leaked) if realloc fails.
+    template_coordinate_key_t **new_buffers =
+        realloc(keys->buffers, new_m * sizeof(*new_buffers));
+    if (new_buffers == NULL) {
+        print_error("sort", "couldn't reallocate memory for template coordinate key buffers");
+        return -1;
+    }
+    keys->buffers = new_buffers;
+
+    // allocate space for new buffers, bumping keys->m only once a buffer
+    // exists so that slots below keys->m are always initialised
+    while (keys->m < new_m) {
+        template_coordinate_key_t *buffer = malloc(sizeof(*buffer) * keys->buffer_size);
+        if (buffer == NULL) {
+            print_error("sort", "couldn't allocate memory for template coordinate key buffer");
+            return -1;
+        }
+        keys->buffers[keys->m++] = buffer;
+    }
+    return 0;
+}
// Struct which contains the a record, and the pointer to the sort tag (if any) or
// a combined ref / position / strand.
-// Used to speed up tag and position sorts.
+// Used to speed up sorts (coordinate, by-tag, and template-coordinate).
typedef struct bam1_tag {
bam1_t *bam_record;
union {
const uint8_t *tag;
uint8_t pos_tid[12];
+ template_coordinate_key_t *key;
} u;
} bam1_tag;
KHASH_INIT(c2c, char*, char*, 1, kh_str_hash_func, kh_str_hash_equal)
KHASH_INIT(cset, char*, char, 0, kh_str_hash_func, kh_str_hash_equal)
KHASH_MAP_INIT_STR(c2i, int)
+KHASH_MAP_INIT_STR(const_c2c, char *)
#define hdrln_free_char(p)
KLIST_INIT(hdrln, char*, hdrln_free_char)
-static int g_is_by_qname = 0;
-static int g_is_by_tag = 0;
-static int g_is_by_minhash = 0;
+static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup);
+
+typedef enum {Coordinate, QueryName, TagCoordinate, TagQueryName, MinHash, TemplateCoordinate} SamOrder;
+static SamOrder g_sam_order = Coordinate;
static char g_sort_tag[2] = {0,0};
static int strnum_cmp(const char *_a, const char *_b)
static inline int bam1_cmp_by_tag(const bam1_tag a, const bam1_tag b);
static inline int bam1_cmp_by_minhash(const bam1_tag a, const bam1_tag b);
+static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b);
+static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header);
+static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup);
// Function to compare reads in the heap and determine which one is < the other
// Note, unlike the bam1_cmp_by_X functions which return <0, 0, >0 this
if (!b.entry.bam_record)
return 0;
- if (g_is_by_tag) {
- int t;
- t = bam1_cmp_by_tag(a.entry, b.entry);
- if (t != 0) return t > 0;
- } else if (g_is_by_minhash) {
- int t = bam1_cmp_by_minhash(a.entry, b.entry);
- if (t != 0) return t > 0;
- } else if (g_is_by_qname) {
- int t, fa, fb;
- t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
- if (t != 0) return t > 0;
- fa = a.entry.bam_record->core.flag & 0xc0;
- fb = b.entry.bam_record->core.flag & 0xc0;
- if (fa != fb) return fa > fb;
- } else {
- if (a.tid != b.tid) return a.tid > b.tid;
- if (a.pos != b.pos) return a.pos > b.pos;
- if (a.rev != b.rev) return a.rev > b.rev;
+ int t, fa, fb;
+ switch (g_sam_order) {
+ case Coordinate:
+ if (a.tid != b.tid) return a.tid > b.tid;
+ if (a.pos != b.pos) return a.pos > b.pos;
+ if (a.rev != b.rev) return a.rev > b.rev;
+ break;
+ case QueryName:
+ t = strnum_cmp(bam_get_qname(a.entry.bam_record), bam_get_qname(b.entry.bam_record));
+ if (t != 0) return t > 0;
+ fa = a.entry.bam_record->core.flag & 0xc0;
+ fb = b.entry.bam_record->core.flag & 0xc0;
+ if (fa != fb) return fa > fb;
+ break;
+ case TagQueryName:
+ case TagCoordinate:
+ t = bam1_cmp_by_tag(a.entry, b.entry);
+ if (t != 0) return t > 0;
+ break;
+ case MinHash:
+ t = bam1_cmp_by_minhash(a.entry, b.entry);
+ if (t != 0) return t > 0;
+ break;
+ case TemplateCoordinate:
+ t = bam1_cmp_template_coordinate(a.entry, b.entry);
+ if (t != 0) return t > 0;
+ break;
+ default:
+ print_error("heap_lt", "unknown sort order: %d", g_sam_order);
+ break;
}
+
// This compares by position in the input file(s)
if (a.i != b.i) return a.i > b.i;
return a.idx > b.idx;
}
// If there are no RG lines in the file and we are overriding add one
- if (is_rg && override && kl_begin(hdr_lines) == NULL) {
+ if (is_rg && override && hdr_lines->size == 0) {
kstring_t new_id = {0, 0, NULL};
kstring_t line = {0, 0, NULL};
kstring_t empty = {0, 0, NULL};
/*!
@abstract Merge multiple sorted BAM.
- @param by_qname whether to sort by query name
- @param sort_tag if non-null, sort by the given tag
+ @param sam_order the order in which the data was sorted
+ @param sort_tag if non-null, the tag that data was sorted by
@param out output BAM file name
@param mode sam_open() mode to be used to create the final output file
(overrides level settings from UNCOMP and LEVEL1 flags)
@discussion Padding information may NOT correctly maintained. This
function is NOT thread safe.
*/
-int bam_merge_core2(int by_qname, char* sort_tag, const char *out, const char *mode,
+int bam_merge_core2(SamOrder sam_order, char* sort_tag, const char *out, const char *mode,
const char *headers, int n, char * const *fn, char * const *fn_idx,
const char *fn_bed, int flag, const char *reg, int n_threads,
const char *cmd, const htsFormat *in_fmt, const htsFormat *out_fmt,
merged_header_t *merged_hdr = init_merged_header();
if (!merged_hdr) return -1;
refs_t *refs = NULL;
+ template_coordinate_keys_t *keys = NULL;
+ khash_t(const_c2c) *lib_lookup = NULL;
// Is there a specified pre-prepared header to use for output?
if (headers) {
}
}
- g_is_by_qname = by_qname;
- if (sort_tag) {
- g_is_by_tag = 1;
+ g_sam_order = sam_order;
+ if (sam_order == TagQueryName || sam_order == TagCoordinate) {
g_sort_tag[0] = sort_tag[0];
g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0';
}
hdr[i] = hin;
int order_ok = 1;
- if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
+ if ((translation_tbl+i)->lost_coord_sort && (sam_order == Coordinate || sam_order == MinHash)) {
fprintf(samtools_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
order_ok = 0;
}
rtrans = NULL;
}
+ // Make sure that there's enough memory for template coordinate keys, one per file to read
+ if (sam_order == TemplateCoordinate) {
+ if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) {
+ print_error("sort", "could not allocate memory for the top-level keys");
+ goto mem_fail;
+ }
+ keys->n = 0;
+ keys->m = 0;
+ keys->buffer_size = 0x10000;
+ keys->buffers = NULL;
+ // Make sure that there's enough memory for template coordinate keys, one per file to read
+ if (keys->n + n >= keys->m * keys->buffer_size) {
+ if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail;
+ }
+ lib_lookup = lookup_libraries(hout);
+ if (!lib_lookup) {
+ goto mem_fail;
+ }
+ }
+
// Load the first read from each file into the heap
for (i = 0; i < n; ++i) {
heap1_t *h = heap + i;
h->pos = (uint64_t)(h->entry.bam_record->core.pos + 1);
h->rev = bam_is_rev(h->entry.bam_record);
h->idx = idx++;
- if (g_is_by_tag) {
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
h->entry.u.tag = bam_aux_get(h->entry.bam_record, g_sort_tag);
+ } else if (g_sam_order == TemplateCoordinate) {
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, i); // get the next key to use
+ h->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
+ if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
} else {
h->entry.u.tag = NULL;
}
bam_destroy1(h->entry.bam_record);
h->entry.bam_record = NULL;
h->entry.u.tag = NULL;
+ h->entry.u.key = NULL;
} else {
print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
heap->pos = (uint64_t)(b->core.pos + 1);
heap->rev = bam_is_rev(b);
heap->idx = idx++;
- if (g_is_by_tag) {
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
+ } else if (g_sam_order == TemplateCoordinate) {
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, heap->i); // get the next key to use
+ heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
+ if (heap->entry.u.key == NULL) goto mem_fail; // key could not be created, error out
} else {
heap->entry.u.tag = NULL;
}
free(fp);
free(rtrans);
free(out_idx_fn);
+ if (keys != NULL) {
+ for (i = 0; i < keys->m; ++i) {
+ free(keys->buffers[i]);
+ }
+ free(keys->buffers);
+ free(keys);
+ }
+ lib_lookup_destroy(lib_lookup);
return -1;
}
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
+ SamOrder sam_order = by_qname ? QueryName : Coordinate;
+ return bam_merge_core2(sam_order, NULL, out, mode, headers, n, fn, NULL, NULL, flag, reg, 0, "merge", NULL, NULL, 0, NULL, 1);
}
static void merge_usage(FILE *to)
" -b FILE List of input BAM filenames, one per line [null]\n"
" -X Use customized index files\n"
" -L FILE Specify a BED file for multiple region filtering [null]\n"
-" --no-PG do not add a PG line\n");
+" --no-PG do not add a PG line\n"
+" --template-coordinate Input files are sorted by template-coordinate\n");
sam_global_opt_help(to, "-.O..@..");
}
int bam_merge(int argc, char *argv[])
{
- int c, is_by_qname = 0, flag = 0, ret = 0, level = -1, has_index_file = 0;
+ int c, flag = 0, ret = 0, level = -1, has_index_file = 0;
char *fn_headers = NULL, *reg = NULL, mode[12];
char *sort_tag = NULL, *fnout = NULL, *arg_list = NULL;
long random_seed = (long)time(NULL);
char** fn = NULL;
char** fn_idx = NULL, *fn_bed = NULL;
int fn_size = 0, no_pg = 0;
+ SamOrder sam_order = Coordinate;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{"no-PG", no_argument, NULL, 1},
+ { "template-coordinate", no_argument, NULL, 2},
{ NULL, 0, NULL, 0 }
};
case 'r': flag |= MERGE_RG; break;
case 'f': flag |= MERGE_FORCE; break;
case 'h': fn_headers = optarg; break;
- case 'n': is_by_qname = 1; break;
+ case 'n': sam_order = QueryName; break;
case 'o': fnout = optarg; break;
case 't': sort_tag = optarg; break;
case '1': flag |= MERGE_LEVEL1; level = 1; break;
break;
}
case 1: no_pg = 1; break;
+ case 2: sam_order = TemplateCoordinate; break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
/* else fall-through */
case '?': merge_usage(samtools_stderr); return 1;
}
}
+ if (sort_tag != NULL) {
+ sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
+ }
+
if (fnout == NULL && argc - optind >= 1) {
fnout = argv[optind];
optind++;
strcpy(mode, "wb");
sam_open_mode(mode+1, fnout, NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
- if (bam_merge_core2(is_by_qname, sort_tag, fnout, mode, fn_headers,
+ if (bam_merge_core2(sam_order, sort_tag, fnout, mode, fn_headers,
fn_size+nargcfiles, fn, fn_idx, fn_bed, flag, reg, ga.nthreads,
"merge", &ga.in, &ga.out, ga.write_index, arg_list, no_pg) < 0)
ret = 1;
* BAM sorting *
***************/
+
typedef struct {
size_t from;
size_t to;
static inline int heap_add_read(heap1_t *heap, int nfiles, samFile **fp,
int num_in_mem, buf_region *in_mem,
- bam1_tag *buf, uint64_t *idx, sam_hdr_t *hout) {
+ bam1_tag *buf, template_coordinate_keys_t *keys,
+ uint64_t *idx, sam_hdr_t *hout,
+ khash_t(const_c2c) *lib_lookup) {
int i = heap->i, res;
if (i < nfiles) { // read from file
res = sam_read1(fp[i], hout, heap->entry.bam_record);
+ if (res >= 0 && g_sam_order == TemplateCoordinate) { // file read OK and TemplateCoordinate order
+ // It is assumed that there are nfiles more keys allocated than keys->n; see allocation in bam_merge_simple
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, keys->n + i); // get the next key to use
+ heap->entry.u.key = template_coordinate_key(heap->entry.bam_record, key, hout, lib_lookup); // update the key
+ if (heap->entry.u.key == NULL) res = -1; // key could not be created, error out
+ }
} else { // read from memory
if (in_mem[i - nfiles].from < in_mem[i - nfiles].to) {
- heap->entry.bam_record = buf[in_mem[i - nfiles].from++].bam_record;
+ size_t from = in_mem[i - nfiles].from;
+ heap->entry.bam_record = buf[from].bam_record;
+ if (g_sam_order == TemplateCoordinate) heap->entry.u.key = buf[from].u.key;
+ in_mem[i - nfiles].from++;
res = 0;
} else {
res = -1;
heap->pos = (uint64_t)(heap->entry.bam_record->core.pos + 1);
heap->rev = bam_is_rev(heap->entry.bam_record);
heap->idx = (*idx)++;
- if (g_is_by_tag) {
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
heap->entry.u.tag = bam_aux_get(heap->entry.bam_record, g_sort_tag);
- } else {
+ } else if (g_sam_order != TemplateCoordinate) {
heap->entry.u.tag = NULL;
+ heap->entry.u.key = NULL;
}
} else if (res == -1) {
heap->pos = HEAP_EMPTY;
if (i < nfiles) bam_destroy1(heap->entry.bam_record);
heap->entry.bam_record = NULL;
heap->entry.u.tag = NULL;
+ heap->entry.u.key = NULL;
} else {
return -1;
}
return 0;
}
-static int bam_merge_simple(int by_qname, char *sort_tag, const char *out,
+static int bam_merge_simple(SamOrder sam_order, char *sort_tag, const char *out,
const char *mode, sam_hdr_t *hout,
int n, char * const *fn, int num_in_mem,
- buf_region *in_mem, bam1_tag *buf, int n_threads,
+ buf_region *in_mem, bam1_tag *buf,
+ template_coordinate_keys_t *keys,
+ khash_t(const_c2c) *lib_lookup, int n_threads,
const char *cmd, const htsFormat *in_fmt,
const htsFormat *out_fmt, char *arg_list, int no_pg,
int write_index) {
int i, heap_size = n + num_in_mem;
char *out_idx_fn = NULL;
- g_is_by_qname = by_qname;
- if (sort_tag) {
- g_is_by_tag = 1;
+ if (sam_order == TagQueryName || sam_order == TagCoordinate) {
g_sort_tag[0] = sort_tag[0];
g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0';
}
heap = (heap1_t*)calloc(heap_size, sizeof(heap1_t));
if (!heap) goto mem_fail;
+ // Make sure that there's enough memory for template coordinate keys, one per file to read
+ if (keys && keys->n + n >= keys->m * keys->buffer_size) {
+ if (template_coordinate_keys_realloc(keys, keys->n + n) < 0) goto mem_fail;
+ }
+
// Open each file, read the header and put the first read into the heap
for (i = 0; i < heap_size; i++) {
sam_hdr_t *hin;
// Get a read into the heap
h->i = i;
h->entry.u.tag = NULL;
+ h->entry.u.key = NULL;
if (i < n) {
h->entry.bam_record = bam_init1();
if (!h->entry.bam_record) goto mem_fail;
}
- if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ if (heap_add_read(h, n, fp, num_in_mem, in_mem, buf, keys, &idx, hout,
+ lib_lookup) < 0) {
assert(i < n);
print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
ks_heapmake(heap, heap_size, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->entry.bam_record;
- if (g_is_by_minhash && b->core.tid == -1) {
+ if (g_sam_order == MinHash && b->core.tid == -1) {
// Remove the cached minhash value
b->core.pos = -1;
b->core.mpos = -1;
print_error_errno(cmd, "failed writing to \"%s\"", out);
goto fail;
}
- if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, &idx, hout) < 0) {
+ if (heap_add_read(heap, n, fp, num_in_mem, in_mem, buf, keys, &idx,
+ hout, lib_lookup) < 0) {
assert(heap->i < n);
print_error(cmd, "Error reading \"%s\" : %s",
fn[heap->i], strerror(errno));
static inline int bam1_cmp_core(const bam1_tag a, const bam1_tag b)
{
uint64_t pa, pb;
- if (!a.bam_record)
- return 1;
- if (!b.bam_record)
- return 0;
+ if (!a.bam_record) return 1;
+ if (!b.bam_record) return 0;
- if (g_is_by_qname) {
+ if (g_sam_order == QueryName || g_sam_order == TagQueryName) {
int t = strnum_cmp(bam_get_qname(a.bam_record), bam_get_qname(b.bam_record));
if (t != 0) return t;
return (int) (a.bam_record->core.flag&0xc0) - (int) (b.bam_record->core.flag&0xc0);
if (!A) return 1;
if (!B) return 0;
- if (A->core.tid != -1 || B->core.tid != -1)
- return bam1_cmp_core(a,b);
+ if (A->core.tid != -1 || B->core.tid != -1) return bam1_cmp_core(a,b);
const uint64_t m_a = (((uint64_t)A->core.pos)<<32)|(uint32_t)A->core.mpos;
const uint64_t m_b = (((uint64_t)B->core.pos)<<32)|(uint32_t)B->core.mpos;
return bam1_cmp_core(a,b);
}
+// Compares two molecular identifiers, ignoring any trailing slash and
+// subsequent single character (so "abc/A" is compared as "abc").
+// Bytes are compared as unsigned char, the same way strcmp() does, so the
+// resulting order does not depend on whether plain char is signed.
+// * if mid1 is less than mid2, then -1 will be returned
+// * if mid1 is greater than mid2, then 1 will be returned
+// * if they are equal once any suffix is removed, then 0 will be returned
+static inline int template_coordinate_key_compare_mid(const char* mid1, const char* mid2) {
+    size_t len1 = strlen(mid1);
+    size_t len2 = strlen(mid2);
+    size_t shortest;
+    int cmp;
+
+    // Snip off trailing slash followed by a single character, if present
+    if (len1 >= 2 && mid1[len1-2] == '/') len1 -= 2;
+    if (len2 >= 2 && mid2[len2-2] == '/') len2 -= 2;
+    shortest = len1 < len2 ? len1 : len2;
+
+    // Compare the part both identifiers have in common
+    cmp = memcmp(mid1, mid2, shortest);
+    if (cmp != 0) return cmp < 0 ? -1 : 1;
+
+    // Common part matches, so the shorter identifier (if any) sorts first
+    if (len1 == len2) return 0;
+    return len1 < len2 ? -1 : 1;
+}
+
+
+// Fills in `key` with the values used to sort record `b` in TemplateCoordinate
+// order and returns `key`. Returns NULL, after printing a message to
+// samtools_stderr, when the key cannot be built: the mate is mapped but the MC
+// tag is missing or is not a string, or the MI tag is present but is not a
+// string.
+// The key stores pointers (not copies) to the record's query name, to its MI
+// value and to the library name held in lib_lookup.
+// `hdr` is not used by this function.
+static template_coordinate_key_t* template_coordinate_key(bam1_t *b, template_coordinate_key_t *key, sam_hdr_t *hdr, khash_t(const_c2c) *lib_lookup) {
+    const int is_mapped = !(b->core.flag & BAM_FUNMAP);
+    const int mate_is_mapped = (b->core.flag & BAM_FPAIRED) && !(b->core.flag & BAM_FMUNMAP);
+    uint8_t *aux;
+    char *rg;
+
+    // Defaults: largest possible coordinates, forward strand, empty MI
+    key->tid1 = INT32_MAX;
+    key->tid2 = INT32_MAX;
+    key->pos1 = HTS_POS_MAX;
+    key->pos2 = HTS_POS_MAX;
+    key->neg1 = false;
+    key->neg2 = false;
+    key->mid = "";
+
+    // Library comes from the read group, when the RG tag is a string that
+    // has an entry in the lookup table; otherwise it is empty
+    key->library = "";
+    rg = (char *)bam_aux_get(b, "RG");
+    if (rg && rg[0] == 'Z') {
+        khiter_t k = kh_get(const_c2c, lib_lookup, rg + 1);
+        if (k < kh_end(lib_lookup)) key->library = kh_value(lib_lookup, k);
+    }
+
+    key->name = bam_get_qname(b);
+
+    // This read's own coordinates, if it is mapped
+    if (is_mapped) {
+        key->tid1 = b->core.tid;
+        key->neg1 = bam_is_rev(b);
+        key->pos1 = (key->neg1) ? unclipped_end(b) : unclipped_start(b);
+    }
+
+    // The mate's coordinates, if it is mapped; these need the mate CIGAR (MC tag)
+    if (mate_is_mapped) {
+        char *cigar;
+        aux = bam_aux_get(b, "MC");
+        if (!aux) {
+            fprintf(samtools_stderr, "[bam_sort] error: no MC tag. Please run samtools fixmate on file first.\n");
+            return NULL;
+        }
+        cigar = bam_aux2Z(aux);
+        if (!cigar) {
+            fprintf(samtools_stderr, "[bam_sort] error: MC tag wrong type. Please use the MC tag provided by samtools fixmate.\n");
+            return NULL;
+        }
+        key->tid2 = b->core.mtid;
+        key->neg2 = bam_is_mrev(b);
+        key->pos2 = (key->neg2) ? unclipped_other_end(b->core.mpos, cigar) : unclipped_other_start(b->core.mpos, cigar);
+    }
+
+    // Molecular identifier, if present
+    aux = bam_aux_get(b, "MI");
+    if (aux) {
+        key->mid = bam_aux2Z(aux);
+        if (!key->mid) {
+            fprintf(samtools_stderr, "[bam_sort] error: MI tag wrong type (not a string).\n");
+            return NULL;
+        }
+    }
+
+    // Decide whether this read is the lower end of the pair. If it is not,
+    // swap the two ends so that both reads of a pair produce the same
+    // coordinates in the same order.
+    const int this_end_is_lower =
+        key->tid1 < key->tid2
+        || (key->tid1 == key->tid2 && key->pos1 < key->pos2)
+        || (key->tid1 == key->tid2 && key->pos1 == key->pos2 && !key->neg1);
+
+    if (this_end_is_lower) {
+        key->is_upper_of_pair = false;
+        return key;
+    }
+
+    key->is_upper_of_pair = true;
+    {
+        int swap_tid = key->tid1;
+        hts_pos_t swap_pos = key->pos1;
+        bool swap_neg = key->neg1;
+
+        key->tid1 = key->tid2;
+        key->pos1 = key->pos2;
+        key->neg1 = key->neg2;
+
+        key->tid2 = swap_tid;
+        key->pos2 = swap_pos;
+        key->neg2 = swap_neg;
+    }
+
+    return key;
+}
+
+// Function to compare reads and determine which one is < or > the other
+// Handles template-coordinate, comparing the precomputed keys in this order:
+//   1. reference ids of the lower, then the upper, end of the template (tid1, tid2)
+//   2. positions of the lower, then the upper, end of the template (pos1, pos2)
+//   3. strands of the two ends (neg1, neg2); a reverse-strand end sorts first
+//   4. library (from read group)
+//   5. the molecular identifier (if present)
+//   6. read name
+//   7. is_upper_of_pair; the read marked as the upper end sorts last
+// Both records are expected to carry a non-NULL u.key.
+// Returns -1, 0 or 1 if a is less than, equal to or greater than b,
+// respectively. If a has no bam_record 1 is returned; otherwise if b has
+// none 0 is returned.
+static inline int bam1_cmp_template_coordinate(const bam1_tag a, const bam1_tag b)
+{
+    if (!a.bam_record) return 1;
+    if (!b.bam_record) return 0;
+
+    const template_coordinate_key_t* key_a = a.u.key;
+    const template_coordinate_key_t* key_b = b.u.key;
+
+    int retval = 0;
+    // Use three-way comparisons for the reference ids rather than subtracting
+    // them: the unmapped default is INT32_MAX, so a subtraction against a
+    // negative id would overflow.
+    if (0 == retval) retval = key_a->tid1 < key_b->tid1 ? -1 : (key_a->tid1 > key_b->tid1 ? 1 : 0);
+    if (0 == retval) retval = key_a->tid2 < key_b->tid2 ? -1 : (key_a->tid2 > key_b->tid2 ? 1 : 0);
+    if (0 == retval) retval = key_a->pos1 < key_b->pos1 ? -1 : (key_a->pos1 > key_b->pos1 ? 1 : 0);
+    if (0 == retval) retval = key_a->pos2 < key_b->pos2 ? -1 : (key_a->pos2 > key_b->pos2 ? 1 : 0);
+    if (0 == retval) retval = key_a->neg1 == key_b->neg1 ? 0 : (key_a->neg1 ? -1 : 1);
+    if (0 == retval) retval = key_a->neg2 == key_b->neg2 ? 0 : (key_a->neg2 ? -1 : 1);
+    if (0 == retval) retval = strcmp(key_a->library, key_b->library);
+    if (0 == retval) retval = template_coordinate_key_compare_mid(key_a->mid, key_b->mid);
+    if (0 == retval) retval = strcmp(key_a->name, key_b->name);
+    if (0 == retval) retval = key_a->is_upper_of_pair == key_b->is_upper_of_pair ? 0 : (key_a->is_upper_of_pair ? 1 : -1);
+    // strcmp() may return any negative/positive value, so normalise
+    return retval < 0 ? -1 : (retval > 0 ? 1 : 0);
+}
+
+
// Function to compare reads and determine which one is < the other
-// Handle sort-by-pos, sort-by-name, or sort-by-tag
+// Handle sort-by-pos, sort-by-name, sort-by-tag, or sort-by-template-coordinate.
static inline int bam1_lt(const bam1_tag a, const bam1_tag b)
{
- if (g_is_by_tag) {
- return bam1_cmp_by_tag(a, b) < 0;
- } else if (g_is_by_minhash) {
- return bam1_cmp_by_minhash(a, b) < 0;
- } else {
- return bam1_cmp_core(a,b) < 0;
+ switch (g_sam_order) {
+ case Coordinate:
+ case QueryName:
+ return bam1_cmp_core(a, b) < 0;
+ case TagQueryName:
+ case TagCoordinate:
+ return bam1_cmp_by_tag(a, b) < 0;
+ case MinHash:
+ return bam1_cmp_by_minhash(a, b) < 0;
+ case TemplateCoordinate:
+ return bam1_cmp_template_coordinate(a, b) < 0;
+ default:
+ return bam1_cmp_core(a,b) < 0;
}
}
int error;
int no_save;
int large_pos;
+ int minimiser_kmer;
} worker_t;
// Returns 0 for success
}
//--- End of candidates to punt to htslib
+
+// Prepares the unmapped records (tid == -1) in a worker's buffer for MinHash
+// sorting: computes each record's minhash using the worker's k-mer size,
+// reverse-complements the record when minhash() reports rev, and caches the
+// hash in the record itself. Records with any other tid are left untouched.
+static inline void worker_minhash(worker_t *w) {
+    int idx;
+    for (idx = 0; idx < w->buf_len; idx++) {
+        bam1_t *rec = w->buf[idx].bam_record;
+        if (rec->core.tid == -1) {
+            int hash_pos = 0, is_rev = 0;
+            uint64_t hash = minhash(rec, w->minimiser_kmer, &hash_pos, &is_rev);
+            if (is_rev)
+                reverse_complement(rec);
+
+            // The 64-bit hash is stored in the unmapped pos and mpos fields.
+            // isize holds a value derived from the hash position, which is
+            // used to resolve ties when sorting by hash key.
+            // These fields are unused for completely unmapped data and
+            // will be reset during final output.
+            int tie_break = 65535 - hash_pos;
+            rec->core.pos = hash >> 31;
+            rec->core.mpos = hash & 0x7fffffff;
+            rec->core.isize = tie_break >= 0 ? tie_break : 0;
+        }
+    }
+}
+
static void *worker(void *data)
{
worker_t *w = (worker_t*)data;
w->error = 0;
w->tmpfile_name = NULL;
- if (!g_is_by_qname && !g_is_by_tag && !g_is_by_minhash) {
- if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
- w->error = errno;
- return NULL;
- }
- } else {
- if (g_is_by_minhash) {
- int i;
- for (i = 0; i < w->buf_len; i++) {
- bam1_t *b = w->buf[i].bam_record;
- if (b->core.tid != -1)
- continue;
-
- int pos = 0, rev = 0;
- uint64_t mh = minhash(b, g_is_by_minhash, &pos, &rev);
- if (rev)
- reverse_complement(b);
-
- // Store 64-bit hash in unmapped pos and mpos fields.
- // The position of hash is in isize, which we use for
- // resolving ties when sorting by hash key.
- // These are unused for completely unmapped data and
- // will be reset during final output.
- b->core.pos = mh>>31;
- b->core.mpos = mh&0x7fffffff;
- b->core.isize = 65535-pos >=0 ? 65535-pos : 0;
+ switch (g_sam_order) {
+ case Coordinate:
+ if (ks_radixsort(w->buf_len, w->buf, w->h) < 0) {
+ w->error = errno;
+ return NULL;
}
- }
- ks_mergesort(sort, w->buf_len, w->buf, 0);
+ break;
+ case MinHash:
+ worker_minhash(w);
+ // no break, go to merge sort
+ default:
+ ks_mergesort(sort, w->buf_len, w->buf, 0);
}
if (w->no_save)
static int sort_blocks(int n_files, size_t k, bam1_tag *buf, const char *prefix,
const sam_hdr_t *h, int n_threads, buf_region *in_mem,
- int large_pos, char **fns, size_t fns_size)
+ int large_pos, int minimiser_kmer, char **fns, size_t fns_size)
{
int i;
size_t pos, rest;
w[i].index = n_files + i;
w[i].tmpfile_name = NULL;
w[i].large_pos = large_pos;
+ w[i].minimiser_kmer = minimiser_kmer;
if (in_mem) {
w[i].no_save = 1;
in_mem[i].from = pos;
return n_files + n_threads;
}
+// Releases an RG to LB lookup table: free()s the value stored in every
+// occupied slot and then destroys the hash table itself. The keys are not
+// freed here. Passing NULL is a no-op.
+static void lib_lookup_destroy(khash_t(const_c2c) *lib_lookup) {
+    if (!lib_lookup) return;
+
+    khiter_t slot = kh_begin(lib_lookup);
+    while (slot < kh_end(lib_lookup)) {
+        if (kh_exist(lib_lookup, slot)) {
+            free(kh_value(lib_lookup, slot));
+        }
+        slot++;
+    }
+    kh_destroy(const_c2c, lib_lookup);
+}
+
+// Builds the RG to LB lookup table used by the template coordinate sort.
+// Each @RG header line that has an LB tag adds one entry, keyed by the name
+// returned by sam_hdr_line_name() and holding the library name; if a key is
+// already present the existing entry is kept.
+// Returns the populated hash table (which may be empty) on success, to be
+// released with lib_lookup_destroy(); NULL on failure.
+static khash_t(const_c2c) * lookup_libraries(sam_hdr_t *header)
+{
+    kstring_t lib_name = KS_INITIALIZE;
+    khash_t(const_c2c) *result = NULL;
+    khash_t(const_c2c) *lib_lookup = kh_init(const_c2c);
+    int num_rg, rg_idx;
+
+    if (!lib_lookup)
+        return NULL;
+
+    // Walk any RG lines looking for library information
+    num_rg = sam_hdr_count_lines(header, "RG");
+    if (num_rg < 0)
+        goto done;
+
+    for (rg_idx = 0; rg_idx < num_rg; rg_idx++) {
+        const char *rg_id = sam_hdr_line_name(header, "RG", rg_idx);
+        khiter_t slot;
+        int res;
+
+        if (!rg_id)
+            goto done;
+
+        res = sam_hdr_find_tag_pos(header, "RG", rg_idx, "LB", &lib_name);
+        if (res < -1)                    // Error
+            goto done;
+        if (res < 0 || !lib_name.s)      // No LB tag on this line
+            continue;
+
+        slot = kh_put(const_c2c, lib_lookup, rg_id, &res);
+        if (res < 0)                     // Error
+            goto done;
+        if (res > 0) {
+            // Newly inserted: the table takes over the string buffer
+            kh_value(lib_lookup, slot) = ks_release(&lib_name);
+        }
+    }
+
+    result = lib_lookup;
+
+ done:
+    if (!result)
+        lib_lookup_destroy(lib_lookup);
+    free(lib_name.s);
+    return result;
+}
/*!
- @abstract Sort an unsorted BAM file based on the chromosome order
- and the leftmost position of an alignment
+ @abstract Sort an unsorted BAM file based on the provided sort order
- @param is_by_qname whether to sort by query name
- @param sort_by_tag if non-null, sort by the given tag
+ @param sam_order the order in which the sort should occur
+ @param sort_tag the tag to use if sorting by Tag
+ @param minimiser_kmer the kmer size when sorting by MinHash
@param fn name of the file to be sorted
@param prefix prefix of the temporary files (prefix.NNNN.bam are written)
@param fnout name of the final output file to be written
and then merge them by calling bam_merge_simple(). This function is
NOT thread safe.
*/
-int bam_sort_core_ext(int is_by_qname, char* sort_by_tag, const char *fn, const char *prefix,
+int bam_sort_core_ext(SamOrder sam_order, char* sort_tag, int minimiser_kmer,
+ const char *fn, const char *prefix,
const char *fnout, const char *modeout,
- size_t _max_mem, int by_minimiser, int n_threads,
+ size_t _max_mem, int n_threads,
const htsFormat *in_fmt, const htsFormat *out_fmt,
char *arg_list, int no_pg, int write_index)
{
int ret = -1, res, i, nref, n_files = 0;
size_t max_k, k, max_mem, bam_mem_offset;
sam_hdr_t *header = NULL;
- samFile *fp;
+ samFile *fp = NULL;
bam1_tag *buf = NULL;
+ template_coordinate_keys_t *keys = NULL;
bam1_t *b = bam_init1();
uint8_t *bam_mem = NULL;
char **fns = NULL;
size_t fns_size = 0;
- const char *new_so;
+ const char *new_so = NULL;
+ const char *new_go = NULL;
+ const char *new_ss = NULL;
buf_region *in_mem = NULL;
+ khash_t(const_c2c) *lib_lookup = NULL;
int num_in_mem = 0;
int large_pos = 0;
}
if (n_threads < 2) n_threads = 1;
- g_is_by_qname = is_by_qname;
- g_is_by_minhash = by_minimiser;
- if (sort_by_tag) {
- g_is_by_tag = 1;
- g_sort_tag[0] = sort_by_tag[0];
- g_sort_tag[1] = sort_by_tag[0] ? sort_by_tag[1] : '\0';
+ g_sam_order = sam_order;
+ if (g_sam_order == TagQueryName || g_sam_order == TagCoordinate) {
+ g_sort_tag[0] = sort_tag[0];
+ g_sort_tag[1] = sort_tag[0] ? sort_tag[1] : '\0';
+ }
+
+ if (sam_order == TemplateCoordinate) {
+ if ((keys = malloc(sizeof(template_coordinate_keys_t))) == NULL) {
+ print_error("sort", "could not allocate memory for the top-level keys");
+ goto err;
+ }
+ keys->n = 0;
+ keys->m = 0;
+ keys->buffer_size = 0x10000;
+ keys->buffers = NULL;
}
max_mem = _max_mem * n_threads;
}
}
- if (sort_by_tag != NULL)
- new_so = "unknown";
- else if (is_by_qname)
- new_so = "queryname";
- else
- new_so = "coordinate";
+ if (g_sam_order == TemplateCoordinate) {
+ lib_lookup = lookup_libraries(header);
+ if (!lib_lookup)
+ goto err;
+ }
- if (by_minimiser) {
- const char *new_ss = "coordinate:minhash";
+ switch (g_sam_order) {
+ case Coordinate:
+ new_so = "coordinate";
+ break;
+ case QueryName:
+ new_so = "queryname";
+ break;
+ case MinHash:
+ new_so = "coordinate";
+ new_ss = "coordinate:minhash";
+ break;
+ case TagQueryName:
+ case TagCoordinate:
+ new_so = "unknown";
+ break;
+ case TemplateCoordinate:
+ new_so = "unsorted";
+ new_go = "query";
+ new_ss = "unsorted:template-coordinate";
+ break;
+ default:
+ new_so = "unknown";
+ break;
+ }
+
+ if (new_ss == NULL && new_go == NULL) { // just SO
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+ goto err;
+ }
+ } else if (new_ss != NULL && new_go == NULL) { // update SO and SS, but not GO
if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "SS", new_ss))
&& (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
"SO", new_so, "SS", new_ss, NULL))
new_so, new_ss);
goto err;
}
- } else {
- if ((-1 == sam_hdr_update_hd(header, "SO", new_so))
- && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION, "SO", new_so, NULL))
+ } else if (new_ss == NULL && new_go != NULL) { // update SO and GO, but not SS
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+ "SO", new_so, "GO", new_go, NULL))
) {
- print_error("sort", "failed to change sort order header to 'SO:%s'\n", new_so);
+ print_error("sort", "failed to change sort order header to 'SO:%s GO:%s'\n",
+ new_so, new_go);
+ goto err;
+ }
+ } else { // update SO, GO, and SS
+ if ((-1 == sam_hdr_update_hd(header, "SO", new_so, "GO", new_go, "SS", new_ss))
+ && (-1 == sam_hdr_add_line(header, "HD", "VN", SAM_FORMAT_VERSION,
+ "SO", new_so, "GO", new_go, "SS", new_ss, NULL))
+ ) {
+ print_error("sort", "failed to change sort order header to 'SO:%s GO:%s SS:%s'\n",
+ new_so, new_go, new_ss);
goto err;
}
}
- if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
- print_error("sort", "failed to delete group order header\n");
- goto err;
+ if (new_go == NULL) {
+ if (-1 == sam_hdr_remove_tag_hd(header, "GO")) {
+ print_error("sort", "failed to delete group order in header\n");
+ goto err;
+ }
+ }
+ if (new_ss == NULL) {
+ if (-1 == sam_hdr_remove_tag_hd(header, "SS")) {
+ print_error("sort", "failed to delete sub sort in header\n");
+ goto err;
+ }
}
// No gain to using the thread pool here as the flow of this code
}
buf = new_buf;
}
+ if (sam_order == TemplateCoordinate && k >= keys->m * keys->buffer_size) {
+ if (template_coordinate_keys_realloc(keys, k + 1) == -1) {
+ goto err;
+ }
+ }
// Check if the BAM record will fit in the memory limit
if (bam_mem_offset + sizeof(*b) + b->l_data < max_mem) {
mem_full = 1;
}
- // Pull out the value of the position
- // or the pointer to the sort tag if applicable
- if (g_is_by_tag) {
- buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
- } else {
- buf[k].u.tag = NULL;
+ // Set the tag if sorting by tag, or the key for template coordinate sorting
+ switch (g_sam_order) {
+ case TagQueryName:
+ case TagCoordinate:
+ buf[k].u.tag = bam_aux_get(buf[k].bam_record, g_sort_tag);
+ break;
+ case TemplateCoordinate:
+ ++keys->n;
+ template_coordinate_key_t *key = template_coordinate_keys_get(keys, k);
+ buf[k].u.key = template_coordinate_key(buf[k].bam_record, key, header, lib_lookup);
+ if (buf[k].u.key == NULL) goto err;
+ break;
+ default:
+ buf[k].u.tag = NULL;
+ buf[k].u.key = NULL;
}
++k;
&fns_size, &fns, 0) < 0)
goto err;
int new_n = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- NULL, large_pos, fns, fns_size);
+ NULL, large_pos, minimiser_kmer, fns, fns_size);
if (new_n < 0) {
goto err;
} else {
n_files = new_n;
}
k = 0;
+ if (keys != NULL) keys->n = 0;
bam_mem_offset = 0;
}
}
in_mem = calloc(n_threads > 0 ? n_threads : 1, sizeof(in_mem[0]));
if (!in_mem) goto err;
num_in_mem = sort_blocks(n_files, k, buf, prefix, header, n_threads,
- in_mem, large_pos, fns, fns_size);
+ in_mem, large_pos, minimiser_kmer, fns, fns_size);
if (num_in_mem < 0) goto err;
} else {
num_in_mem = 0;
// write the final output
if (n_files == 0 && num_in_mem < 2) { // a single block
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt,
- g_is_by_minhash, arg_list, no_pg, write_index) != 0) {
+ minimiser_kmer, arg_list, no_pg, write_index) != 0) {
print_error_errno("sort", "failed to create \"%s\"", fnout);
goto err;
}
abort();
}
}
- if (bam_merge_simple(is_by_qname, sort_by_tag, fnout, modeout, header,
- n_files, fns, num_in_mem, in_mem, buf,
- n_threads, "sort", in_fmt, out_fmt, arg_list,
- no_pg, write_index) < 0) {
+ char *sort_by_tag = (sam_order == TagQueryName || sam_order == TagCoordinate) ? sort_tag : NULL;
+ if (bam_merge_simple(sam_order, sort_by_tag, fnout, modeout, header,
+ n_files, fns, num_in_mem, in_mem, buf, keys,
+ lib_lookup, n_threads, "sort", in_fmt, out_fmt,
+ arg_list, no_pg, write_index) < 0) {
// Propagate bam_merge_simple() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
}
bam_destroy1(b);
free(buf);
+ if (keys != NULL) {
+ for (i = 0; i < keys->m; ++i) {
+ free(keys->buffers[i]);
+ }
+ free(keys->buffers);
+ free(keys);
+ }
free(bam_mem);
free(in_mem);
+ lib_lookup_destroy(lib_lookup);
sam_hdr_destroy(header);
if (fp) sam_close(fp);
return ret;
char *fnout = calloc(strlen(prefix) + 4 + 1, 1);
if (!fnout) return -1;
sprintf(fnout, "%s.bam", prefix);
- ret = bam_sort_core_ext(is_by_qname, NULL, fn, prefix, fnout, "wb", max_mem, 0, 0, NULL, NULL, NULL, 1, 0);
+ SamOrder sam_order = is_by_qname ? QueryName : Coordinate;
+ g_sam_order = sam_order;
+ ret = bam_sort_core_ext(sam_order, NULL, 0, fn, prefix, fnout, "wb", max_mem, 0, NULL, NULL, NULL, 1, 0);
free(fnout);
return ret;
}
" -t TAG Sort by value of TAG. Uses position as secondary index (or read name if -n is set)\n"
" -o FILE Write final output to FILE rather than standard output\n"
" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
-" --no-PG do not add a PG line\n");
+" --no-PG\n"
+" Do not add a PG line\n"
+" --template-coordinate\n"
+" Sort by template-coordinate\n");
sam_global_opt_help(fp, "-.O..@..");
}
int bam_sort(int argc, char *argv[])
{
size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
- int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1, no_pg = 0;
- int by_minimiser = 0, minimiser_kmer = 20;
+ int c, nargs, ret, o_seen = 0, level = -1, no_pg = 0;
+ SamOrder sam_order = Coordinate;
+ bool by_tag = false;
+ int minimiser_kmer = 20;
char* sort_tag = NULL, *arg_list = NULL;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{"no-PG", no_argument, NULL, 1},
+ { "template-coordinate", no_argument, NULL, 2},
{ NULL, 0, NULL, 0 }
};
while ((c = getopt_long(argc, argv, "l:m:no:O:T:@:t:MK:u", lopts, NULL)) >= 0) {
switch (c) {
case 'o': fnout = optarg; o_seen = 1; break;
- case 'n': is_by_qname = 1; break;
- case 't': sort_tag = optarg; break;
+ case 'n': sam_order = QueryName; break;
+ case 't': by_tag = true; sort_tag = optarg; break;
case 'm': {
char *q;
max_mem = strtol(optarg, &q, 0);
case 'l': level = atoi(optarg); break;
case 'u': level = 0; break;
case 1: no_pg = 1; break;
- case 'M': by_minimiser = 1; break;
+ case 2: sam_order = TemplateCoordinate; break;
+ case 'M': sam_order = MinHash; break;
case 'K':
minimiser_kmer = atoi(optarg);
if (minimiser_kmer < 1)
}
}
+ // Change sort order if tag sorting is requested. Must update based on secondary index
+ if (by_tag) {
+ sam_order = sam_order == QueryName ? TagQueryName : TagCoordinate;
+ }
+
nargs = argc - optind;
if (nargs == 0 && isatty(STDIN_FILENO)) {
sort_usage(samtools_stdout);
goto sort_end;
}
- if (ga.write_index && (is_by_qname || sort_tag)) {
+ if (ga.write_index && (sam_order == QueryName || sam_order == TagQueryName || sam_order == TagCoordinate || sam_order == TemplateCoordinate)) {
fprintf(samtools_stderr, "[W::bam_sort] Ignoring --write-index as it only works for position sorted files.\n");
ga.write_index = 0;
}
ksprintf(&tmpprefix, "samtools.%d.%u.tmp", (int) getpid(), t % 10000);
}
- ret = bam_sort_core_ext(is_by_qname, sort_tag, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem,
- by_minimiser * minimiser_kmer, ga.nthreads,
+ ret = bam_sort_core_ext(sam_order, sort_tag, (sam_order == MinHash) ? minimiser_kmer : 0,
+ (nargs > 0) ? argv[optind] : "-",
+ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out, arg_list, no_pg, ga.write_index);
if (ret >= 0)
ret = EXIT_SUCCESS;
int main_import(int argc, char *argv[]);
int main_samples(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
+int main_reference(int argc, char *argv[]);
const char *samtools_version()
{
" fastq converts a BAM to a FASTQ\n"
" fasta converts a BAM to a FASTA\n"
" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
+" reference Generates a reference from aligned data\n"
"\n"
" -- Statistics\n"
" bedcov read depth per BED region\n"
else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1);
else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1);
else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1);
+ else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0 || \
strcmp(argv[1], "--version") == 0)
long_version();
int main_import(int argc, char *argv[]);
int main_samples(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
+int main_reference(int argc, char *argv[]);
const char *samtools_version()
{
" fastq converts a BAM to a FASTQ\n"
" fasta converts a BAM to a FASTA\n"
" import Converts FASTA or FASTQ files to SAM/BAM/CRAM\n"
+" reference Generates a reference from aligned data\n"
"\n"
" -- Statistics\n"
" bedcov read depth per BED region\n"
else if (strcmp(argv[1], "ampliconstats") == 0) ret = main_ampliconstats(argc-1, argv+1);
else if (strcmp(argv[1], "samples") == 0) ret = main_samples(argc-1, argv+1);
else if (strcmp(argv[1], "consensus") == 0) ret = main_consensus(argc-1, argv+1);
+ else if (strcmp(argv[1], "reference") == 0) ret = main_reference(argc-1, argv+1);
else if (strcmp(argv[1], "version") == 0 || \
strcmp(argv[1], "--version") == 0)
long_version();
/* bedcov.c -- bedcov subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2018-2022 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
hts_itr_t *iter;
int min_mapQ;
uint32_t flags; // read filtering flags
+ int64_t rcnt;
} aux_t;
static int read_bam(void *data, bam1_t *b)
return ret;
}
+// Callback registered with bam_mplp_constructor(): adds one to the rcnt field
+// of the aux_t passed as `data` each time it is invoked. `b` and `cd` are not
+// used. Always returns 0.
+static int incr_rcnt(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    aux_t *state = data;
+    state->rcnt += 1;
+    return 0;
+}
+
int main_bedcov(int argc, char *argv[])
{
gzFile fp;
kstream_t *ks;
hts_idx_t **idx;
aux_t **aux;
- int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
- int64_t *cnt, *pcov = NULL;;
+ int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0;
+ int skip_DN = 0, do_rcount = 0;
+ int64_t *cnt, *pcov = NULL;
const bam_pileup1_t **plp;
int usage = 0, has_index_file = 0;
uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:c", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': min_mapQ = atoi(optarg); break;
case 'X': has_index_file = 1; break;
+ case 'c': do_rcount = 1; break;
case 'g':
tflags = bam_str2flag(optarg);
if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
fprintf(stderr, " -X use customized index files\n");
fprintf(stderr, " -g <flags> remove the specified flags from the set used to filter out reads\n");
fprintf(stderr, " -G <flags> add the specified flags to the set used to filter out reads\n"
- " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
+ " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704\n");
fprintf(stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n");
fprintf(stderr, " -d <int> depth threshold. Number of reference bases with coverage above and"
" including this value will be displayed in a separate column\n");
+ fprintf(stderr, " -c add an additional column showing read count\n");
sam_global_opt_help(stderr, "-.--.--.");
return 1;
}
aux[i]->flags = flags;
}
cnt = calloc(n, sizeof(*cnt));
+
if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
- if (!cnt || (min_depth >= 0 && !pcov)) return 2;
+ if (!cnt || (min_depth >= 0 && !pcov)) {
+ print_error_errno("bedcov", "failed to allocate memory");
+ return 2;
+ }
fp = gzopen(argv[optind], "rb");
if (fp == NULL) {
for (i = 0; i < n; ++i) {
if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
+ aux[i]->rcnt = 0;
}
mplp = bam_mplp_init(n, read_bam, (void**)aux);
memset(cnt, 0, sizeof(*cnt) * n);
if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
+ if (do_rcount)
+ bam_mplp_constructor(mplp, incr_rcnt);
+
while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
if (pos >= beg && pos < end) {
for (i = 0; i < n; ++i) {
kputl(pcov[i], &str);
}
}
+ if (do_rcount) {
+ for (i = 0; i < n; ++i) {
+ kputc('\t', &str);
+ kputl(aux[i]->rcnt, &str);
+ }
+ }
puts(str.s);
bam_mplp_destroy(mplp);
continue;
/* bedcov.c -- bedcov subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013-2014, 2018-2021 Genome Research Ltd.
+ Copyright (C) 2013-2014, 2018-2022 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
hts_itr_t *iter;
int min_mapQ;
uint32_t flags; // read filtering flags
+ int64_t rcnt;
} aux_t;
static int read_bam(void *data, bam1_t *b)
return ret;
}
+// Callback registered with bam_mplp_constructor(): adds one to the rcnt field
+// of the aux_t passed as `data` each time it is invoked. `b` and `cd` are not
+// used. Always returns 0.
+static int incr_rcnt(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    aux_t *state = data;
+    state->rcnt += 1;
+    return 0;
+}
+
int main_bedcov(int argc, char *argv[])
{
gzFile fp;
kstream_t *ks;
hts_idx_t **idx;
aux_t **aux;
- int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0, skip_DN = 0;
- int64_t *cnt, *pcov = NULL;;
+ int *n_plp, dret, i, j, m, n, c, ret, status = 0, min_mapQ = 0;
+ int skip_DN = 0, do_rcount = 0;
+ int64_t *cnt, *pcov = NULL;
const bam_pileup1_t **plp;
int usage = 0, has_index_file = 0;
uint32_t flags = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Q:Xg:G:jd:c", lopts, NULL)) >= 0) {
switch (c) {
case 'Q': min_mapQ = atoi(optarg); break;
case 'X': has_index_file = 1; break;
+ case 'c': do_rcount = 1; break;
case 'g':
tflags = bam_str2flag(optarg);
if (tflags < 0 || tflags > ((BAM_FSUPPLEMENTARY << 1) - 1)) {
fprintf(samtools_stderr, " -X use customized index files\n");
fprintf(samtools_stderr, " -g <flags> remove the specified flags from the set used to filter out reads\n");
fprintf(samtools_stderr, " -G <flags> add the specified flags to the set used to filter out reads\n"
- " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704");
+ " The default set is UNMAP,SECONDARY,QCFAIL,DUP or 0x704\n");
fprintf(samtools_stderr, " -j do not include deletions (D) and ref skips (N) in bedcov computation\n");
fprintf(samtools_stderr, " -d <int> depth threshold. Number of reference bases with coverage above and"
" including this value will be displayed in a separate column\n");
+ fprintf(samtools_stderr, " -c add an additional column showing read count\n");
sam_global_opt_help(samtools_stderr, "-.--.--.");
return 1;
}
aux[i]->flags = flags;
}
cnt = calloc(n, sizeof(*cnt));
+
if (min_depth >= 0) pcov = calloc(n, sizeof(*pcov));
- if (!cnt || (min_depth >= 0 && !pcov)) return 2;
+ if (!cnt || (min_depth >= 0 && !pcov)) {
+ print_error_errno("bedcov", "failed to allocate memory");
+ return 2;
+ }
fp = gzopen(argv[optind], "rb");
if (fp == NULL) {
for (i = 0; i < n; ++i) {
if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
+ aux[i]->rcnt = 0;
}
mplp = bam_mplp_init(n, read_bam, (void**)aux);
memset(cnt, 0, sizeof(*cnt) * n);
if (min_depth >= 0) memset(pcov, 0, sizeof(*pcov) * n);
+ if (do_rcount)
+ bam_mplp_constructor(mplp, incr_rcnt);
+
while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0)
if (pos >= beg && pos < end) {
for (i = 0; i < n; ++i) {
kputl(pcov[i], &str);
}
}
+ if (do_rcount) {
+ for (i = 0; i < n; ++i) {
+ kputc('\t', &str);
+ kputl(aux[i]->rcnt, &str);
+ }
+ }
samtools_puts(str.s);
bam_mplp_destroy(mplp);
continue;
#include "samtools.h"
#include "sam_opts.h"
-const char *VERSION = "0.1";
-
typedef struct { // auxiliary data structure to hold stats on coverage
unsigned long long n_covered_bases;
unsigned long long summed_coverage;
fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
fputc(' ', file_out);
switch (i) {
- case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
+ case 9: fprintf(file_out, "Number of reads: %u", stats[tid].n_selected_reads); break;
case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
case 6: fprintf(file_out, "Percent covered: %.4g%%",
#include "samtools.h"
#include "sam_opts.h"
-const char *VERSION = "0.1";
-
typedef struct { // auxiliary data structure to hold stats on coverage
unsigned long long n_covered_bases;
unsigned long long summed_coverage;
fprintf(file_out, full_utf ? VERTICAL_LINE : "|");
fputc(' ', file_out);
switch (i) {
- case 9: fprintf(file_out, "Number of reads: %i", stats[tid].n_selected_reads); break;
+ case 9: fprintf(file_out, "Number of reads: %u", stats[tid].n_selected_reads); break;
case 8: if (stats[tid].n_reads - stats[tid].n_selected_reads > 0) fprintf(file_out, " (%i filtered)", stats[tid].n_reads - stats[tid].n_selected_reads); break;
case 7: fprintf(file_out, "Covered bases: %sbp", readable_bps(stats[tid].n_covered_bases, buf)); break;
case 6: fprintf(file_out, "Percent covered: %.4g%%",
#include <unistd.h>
#include <zlib.h>
#include <getopt.h>
+#include "htslib/khash.h"
#include "htslib/kseq.h"
#include "htslib/hts.h"
+#include "samtools.h"
+KHASH_SET_INIT_STR(str)
KSEQ_INIT(gzFile, gzread)
typedef struct _args_t
{
- char *output_fname, *fname;
+ char *output_fname, *alt_fname;
char *assembly, *species, *uri;
int alias, header;
+ khash_t(str) *is_alt;
}
args_t;
fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
if (fp == 0) {
- fprintf(stderr, "dict: %s: No such file or directory\n", fn);
+ print_error_errno("dict", "Cannot open %s", fn);
exit(1);
}
FILE *out = stdout;
if (args->output_fname) {
out = fopen(args->output_fname, "w");
if (out == NULL) {
- fprintf(stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
+ print_error_errno("dict", "Cannot open %s for writing", args->output_fname);
exit(1);
}
}
hts_md5_final(digest, md5);
hts_md5_hex(hex, digest);
fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+ if (args->is_alt && kh_get(str, args->is_alt, seq->name.s) != kh_end(args->is_alt))
+ fprintf(out, "\tAH:*");
if (args->alias) {
const char *name = seq->name.s;
if (strncmp(name, "chr", 3) == 0) {
gzclose(fp);
}
+/*
+ * Reads sequence names from the .alt file 'fname' into the 'is_alt' set.
+ *
+ * The first tab-delimited field of every non-empty line that does not start
+ * with '@' is added as a key.  Keys are strdup()ed copies; the caller frees
+ * them when destroying the set.
+ *
+ * Exits with status 1 if the file cannot be opened or read, or on memory
+ * allocation failure.
+ */
+static void read_alt_file(khash_t(str) *is_alt, const char *fname)
+{
+    // The caller passes the result of kh_init() unchecked.
+    if (is_alt == NULL) {
+        print_error("dict", "Failed to allocate memory for alt names");
+        exit(1);
+    }
+
+    htsFile *fp = hts_open(fname, "r");
+    if (fp == NULL) {
+        print_error_errno("dict", "Cannot open %s", fname);
+        exit(1);
+    }
+
+    // .alt files are in a SAM-like format, but we don't use sam_read1()
+    // as these files may not have a complete set of @SQ headers.
+
+    kstring_t str = KS_INITIALIZE;
+    int r;
+    while ((r = hts_getline(fp, KS_SEP_LINE, &str)) >= 0) {
+        if (str.l == 0 || str.s[0] == '@') continue;
+
+        // Keep only the first column (the sequence name).
+        char *tab = strchr(str.s, '\t');
+        if (tab) *tab = '\0';
+
+        int ret;
+        char *seqname = strdup(str.s);
+        if (seqname == NULL) {
+            print_error_errno("dict", "Failed to allocate memory for alt names");
+            exit(1);
+        }
+
+        kh_put(str, is_alt, seqname, &ret);
+        if (ret < 0) {
+            free(seqname);
+            print_error("dict", "Failed to allocate memory for alt names");
+            exit(1);
+        }
+        if (ret == 0) free(seqname); // Already present
+    }
+
+    // hts_getline() returns -1 at EOF and a smaller value on a read error.
+    if (r < -1) {
+        print_error("dict", "Error reading %s", fname);
+        exit(1);
+    }
+
+    ks_free(&str);
+    hts_close(fp);
+}
+
static int dict_usage(void)
{
fprintf(stderr, "\n");
fprintf(stderr, " -A, --alias, --alternative-name\n");
fprintf(stderr, " add AN tag by adding/removing 'chr'\n");
fprintf(stderr, " -H, --no-header do not print @HD line\n");
+ fprintf(stderr, " -l, --alt FILE add AH:* tag to alternate locus sequences\n");
fprintf(stderr, " -o, --output FILE file to write out dict file [stdout]\n");
fprintf(stderr, " -s, --species STR species\n");
fprintf(stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
{"help", no_argument, NULL, 'h'},
{"no-header", no_argument, NULL, 'H'},
{"alias", no_argument, NULL, 'A'},
+ {"alt", required_argument, NULL, 'l'},
{"alternative-name", no_argument, NULL, 'A'},
{"assembly", required_argument, NULL, 'a'},
{"species", required_argument, NULL, 's'},
{NULL, 0, NULL, 0}
};
int c;
- while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
+ while ( (c=getopt_long(argc,argv,"?AhHa:l:s:u:o:",loptions,NULL))>0 )
{
switch (c)
{
case 'A': args->alias = 1; break;
case 'a': args->assembly = optarg; break;
+ case 'l': args->alt_fname = optarg; break;
case 's': args->species = optarg; break;
case 'u': args->uri = optarg; break;
case 'o': args->output_fname = optarg; break;
}
else fname = argv[optind];
+ if (args->alt_fname) {
+ args->is_alt = kh_init(str);
+ read_alt_file(args->is_alt, args->alt_fname);
+ }
+
write_dict(fname, args);
+
+ if (args->is_alt) {
+ khint_t k;
+ for (k = 0; k < kh_end(args->is_alt); ++k)
+ if (kh_exist(args->is_alt, k)) free((char *) kh_key(args->is_alt, k));
+ kh_destroy(str, args->is_alt);
+ }
+
free(args);
return 0;
}
#include <unistd.h>
#include <zlib.h>
#include <getopt.h>
+#include "htslib/khash.h"
#include "htslib/kseq.h"
#include "htslib/hts.h"
+#include "samtools.h"
+KHASH_SET_INIT_STR(str)
KSEQ_INIT(gzFile, gzread)
typedef struct _args_t
{
- char *output_fname, *fname;
+ char *output_fname, *alt_fname;
char *assembly, *species, *uri;
int alias, header;
+ khash_t(str) *is_alt;
}
args_t;
fp = strcmp(fn, "-") ? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
if (fp == 0) {
- fprintf(samtools_stderr, "dict: %s: No such file or directory\n", fn);
+ print_error_errno("dict", "Cannot open %s", fn);
samtools_exit(1);
}
FILE *out = samtools_stdout;
if (args->output_fname) {
out = fopen(args->output_fname, "w");
if (out == NULL) {
- fprintf(samtools_stderr, "dict: %s: Cannot open file for writing\n", args->output_fname);
+ print_error_errno("dict", "Cannot open %s for writing", args->output_fname);
samtools_exit(1);
}
}
hts_md5_final(digest, md5);
hts_md5_hex(hex, digest);
fprintf(out, "@SQ\tSN:%s\tLN:%d\tM5:%s", seq->name.s, k, hex);
+ if (args->is_alt && kh_get(str, args->is_alt, seq->name.s) != kh_end(args->is_alt))
+ fprintf(out, "\tAH:*");
if (args->alias) {
const char *name = seq->name.s;
if (strncmp(name, "chr", 3) == 0) {
gzclose(fp);
}
+/*
+ * Reads sequence names from the .alt file 'fname' into the 'is_alt' set.
+ *
+ * The first tab-delimited field of every non-empty line that does not start
+ * with '@' is added as a key.  Keys are strdup()ed copies; the caller frees
+ * them when destroying the set.
+ *
+ * Calls samtools_exit(1) if the file cannot be opened or read, or on memory
+ * allocation failure.
+ */
+static void read_alt_file(khash_t(str) *is_alt, const char *fname)
+{
+    // The caller passes the result of kh_init() unchecked.
+    if (is_alt == NULL) {
+        print_error("dict", "Failed to allocate memory for alt names");
+        samtools_exit(1);
+    }
+
+    htsFile *fp = hts_open(fname, "r");
+    if (fp == NULL) {
+        print_error_errno("dict", "Cannot open %s", fname);
+        samtools_exit(1);
+    }
+
+    // .alt files are in a SAM-like format, but we don't use sam_read1()
+    // as these files may not have a complete set of @SQ headers.
+
+    kstring_t str = KS_INITIALIZE;
+    int r;
+    while ((r = hts_getline(fp, KS_SEP_LINE, &str)) >= 0) {
+        if (str.l == 0 || str.s[0] == '@') continue;
+
+        // Keep only the first column (the sequence name).
+        char *tab = strchr(str.s, '\t');
+        if (tab) *tab = '\0';
+
+        int ret;
+        char *seqname = strdup(str.s);
+        if (seqname == NULL) {
+            print_error_errno("dict", "Failed to allocate memory for alt names");
+            samtools_exit(1);
+        }
+
+        kh_put(str, is_alt, seqname, &ret);
+        if (ret < 0) {
+            free(seqname);
+            print_error("dict", "Failed to allocate memory for alt names");
+            samtools_exit(1);
+        }
+        if (ret == 0) free(seqname); // Already present
+    }
+
+    // hts_getline() returns -1 at EOF and a smaller value on a read error.
+    if (r < -1) {
+        print_error("dict", "Error reading %s", fname);
+        samtools_exit(1);
+    }
+
+    ks_free(&str);
+    hts_close(fp);
+}
+
static int dict_usage(void)
{
fprintf(samtools_stderr, "\n");
fprintf(samtools_stderr, " -A, --alias, --alternative-name\n");
fprintf(samtools_stderr, " add AN tag by adding/removing 'chr'\n");
fprintf(samtools_stderr, " -H, --no-header do not print @HD line\n");
+ fprintf(samtools_stderr, " -l, --alt FILE add AH:* tag to alternate locus sequences\n");
fprintf(samtools_stderr, " -o, --output FILE file to write out dict file [samtools_stdout]\n");
fprintf(samtools_stderr, " -s, --species STR species\n");
fprintf(samtools_stderr, " -u, --uri STR URI [file:///abs/path/to/file.fa]\n");
{"help", no_argument, NULL, 'h'},
{"no-header", no_argument, NULL, 'H'},
{"alias", no_argument, NULL, 'A'},
+ {"alt", required_argument, NULL, 'l'},
{"alternative-name", no_argument, NULL, 'A'},
{"assembly", required_argument, NULL, 'a'},
{"species", required_argument, NULL, 's'},
{NULL, 0, NULL, 0}
};
int c;
- while ( (c=getopt_long(argc,argv,"?AhHa:s:u:o:",loptions,NULL))>0 )
+ while ( (c=getopt_long(argc,argv,"?AhHa:l:s:u:o:",loptions,NULL))>0 )
{
switch (c)
{
case 'A': args->alias = 1; break;
case 'a': args->assembly = optarg; break;
+ case 'l': args->alt_fname = optarg; break;
case 's': args->species = optarg; break;
case 'u': args->uri = optarg; break;
case 'o': args->output_fname = optarg; break;
}
else fname = argv[optind];
+ if (args->alt_fname) {
+ args->is_alt = kh_init(str);
+ read_alt_file(args->is_alt, args->alt_fname);
+ }
+
write_dict(fname, args);
+
+ if (args->is_alt) {
+ khint_t k;
+ for (k = 0; k < kh_end(args->is_alt); ++k)
+ if (kh_exist(args->is_alt, k)) free((char *) kh_key(args->is_alt, k));
+ kh_destroy(str, args->is_alt);
+ }
+
free(args);
return 0;
}
--- /dev/null
+/* bam_reference.c -- extracts an embedded reference from a CRAM file,
+ or creates it from alignments plus MD:Z tags.
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "htslib/sam.h"
+#include "htslib/cram.h"
+#include "samtools.h"
+#include "sam_opts.h"
+
+
+/*
+ * There are two main modes of operation.
+ *
+ * 1. Extracting the reference from the CRAM file embed_ref blocks.
+ * 2. Generation of reference by analysing consensus plus patches applied
+ * via MD tags.
+ *
+ * The first is very rapid, but only applies to CRAM files generated with
+ * specific options (not commonly used and not the default).  The second
+ * is a slow operation, but applies to any data type.
+ *
+ * This is also a testing ground for a future CRAM auto-embed-ref option that
+ * permits the use of an embedded reference without having to first extract
+ * the reference. (Note this may require the creation of MD tags during
+ * decode by use of an existing embedded reference, if the records don't
+ * have an MD tag themselves, but that's an issue for htslib when we get
+ * there.)
+ */
+
+/*
+ * ---------------------------------------------------------------------------
+ * Shared utility functions by both methods.
+ */
+
+#define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL)
+#define MIN(a,b) ((a)<(b)?(a):(b))
+/*
+ * Writes reference 'ref_id' to 'fp' as a FASTA record, 60 bases per line.
+ *
+ * When 'iter' is non-NULL and covers only part of the sequence, the header
+ * is ">name:beg-end" and only that sub-range of ref[] is written; otherwise
+ * the header is ">name" and all 'ref_len' bases are written.  An open-ended
+ * iterator (end >= HTS_POS_MAX) has its end set to 'ref_len' as a side
+ * effect.  A region starting at or beyond the end of the sequence yields a
+ * header with no bases.
+ *
+ * If 'verbose' is set, a summary line giving the percentage of non-'N'
+ * bases written is printed to stderr.
+ *
+ * Returns 0 on success,
+ *        -1 on write failure.
+ */
+static int dump_ref(sam_hdr_t *h, hts_itr_t *iter, int ref_id,
+                    char *ref, uint64_t ref_len, FILE *fp, int verbose) {
+    uint64_t N = 0;
+    if (iter && iter->end >= HTS_POS_MAX)
+        iter->end = ref_len;
+    if (iter && (iter->beg > 0 || iter->end < ref_len)) {
+        fprintf(fp, ">%s:%"PRIhts_pos"-%"PRIhts_pos"\n",
+                sam_hdr_tid2name(h, ref_id), iter->beg+1, iter->end);
+
+        // Clamp the region to the sequence so that a start position beyond
+        // the end cannot make the unsigned length wrap around.
+        uint64_t beg = iter->beg > 0 ? (uint64_t) iter->beg : 0;
+        uint64_t end = iter->end > 0 ? MIN(ref_len, (uint64_t) iter->end) : 0;
+        if (beg < end) {
+            ref += beg;
+            ref_len = end - beg;
+        } else {
+            ref_len = 0;
+        }
+    } else {
+        fprintf(fp, ">%s\n", sam_hdr_tid2name(h, ref_id));
+    }
+
+    // Count coverage, purely for information purposes.
+    // Words of 8 bases containing no 'N' are rejected with a single test;
+    // only the remaining words are examined byte by byte.
+    if (verbose) {
+        uint64_t j = 0, fast_end = ref_len & ~(uint64_t) 7;
+        for (; j < fast_end; j += 8) {
+            uint64_t i64;
+            // memcpy rather than a pointer cast: no alignment or aliasing
+            // requirements on ref[].
+            memcpy(&i64, &ref[j], sizeof(i64));
+            if (!haszero(i64 ^ 0x4e4e4e4e4e4e4e4eUL)) // 'N' <-> 0
+                continue;
+
+            int k;
+            for (k = 0; k < 8; k++)
+                N += ref[j+k] == 'N';
+        }
+        for (; j < ref_len; j++)
+            N += ref[j] == 'N';
+    }
+
+    // Format reference
+    uint64_t rem = ref_len;
+    while (rem > 0) {
+        size_t len = rem < 60 ? (size_t) rem : 60;
+        if (fwrite(ref, 1, len, fp) != len)
+            return -1;
+        if (putc('\n', fp) == EOF)
+            return -1;
+        ref += len;
+        rem -= len;
+    }
+
+    if (verbose)
+        fprintf(stderr, "Dump ref %d len %"PRIu64", coverage %.2f%%\n",
+                ref_id, ref_len,
+                ref_len ? 100 - N*100.0 / ref_len : 0.0);
+
+    return 0;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * CRAM embedded reference method of reference construction
+ */
+
+/*
+ * Extracts an embedded reference from a sorted CRAM file.
+ * Modelled on the CRAM container copy loop from bam_cat.c.
+ *
+ * Containers are read one at a time; for each slice the block holding the
+ * embedded reference is uncompressed and copied into a per-chromosome
+ * buffer pre-filled with 'N'.  The buffer is written with dump_ref() each
+ * time the reference ID changes and once more at the end.  If 'reg' is
+ * given, reading stops at the first slice outside that region, and a region
+ * with no data is still reported, as all 'N'.
+ *
+ * Returns 0 on success,
+ *        -1 on failure.
+ */
+static int cram2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg,
+                    FILE *outfp, int verbose) {
+    cram_fd *in_c;
+    cram_container *c = NULL;
+    cram_block *blk = NULL;
+    cram_block_slice_hdr *shdr = NULL;
+
+    int curr_ref_id = -99;
+    char *ref = NULL;
+    uint64_t ref_len = 0;
+
+    // We have no direct public API for seeking in CRAM to a specific
+    // location by genome coordinates.  The sam_itr_query API is
+    // designed for fetching records, rather than seeks to specific
+    // file locations.
+    //
+    // TODO: consider exposing cram_range and cram_seek_to_refpos API.
+    // After a sam_index_load which will add the index to infp, these
+    // functions should seek direct to the start of a container.
+    // Or use cram_index *e =cram_index_query(cram, tid, beg, NULL);
+    //
+    // However, fortuitously(?) sam_itr_querys calls cram_seek_to_refpos
+    // so we can do a region query and let that do the initial seek.
+    // We still need to do our own end-range detection though.
+
+    hts_itr_t *iter = NULL;
+    if (reg) {
+        iter = sam_itr_querys(idx, h, reg);
+        if (!iter) {
+            print_error("reference", "failed to parse region '%s'", reg);
+            goto err;
+        }
+    }
+
+    in_c = in->fp.cram; // low level htslib abuse?
+    int eor = 0;
+    while (!eor && (c = cram_read_container(in_c))) {
+        if (cram_container_is_empty(in_c)) {
+            // Container compression header
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+            cram_free_container(c);
+            c = NULL;
+            continue;
+        }
+
+        // Container compression header; read and discard
+        int32_t num_slices;
+        if (!(blk = cram_read_block(in_c)))
+            goto err;
+        cram_free_block(blk);
+        blk = NULL;
+
+        // Container num_blocks can be invalid, due to a bug.
+        // Instead we iterate in slice context instead.
+        (void)cram_container_get_landmarks(c, &num_slices);
+        int i, j;
+        for (i = 0; i < num_slices; i++) {
+            // Slice header
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            if (!(shdr = cram_decode_slice_header(in_c, blk)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+
+            int num_blocks = cram_slice_hdr_get_num_blocks(shdr);
+            int embed_id = cram_slice_hdr_get_embed_ref_id(shdr);
+            int ref_id;
+            hts_pos_t ref_start, ref_span;
+            cram_slice_hdr_get_coords(shdr, &ref_id, &ref_start, &ref_span);
+
+            if (iter) {
+                if (iter->tid != ref_id || ref_start > iter->end) {
+                    // Beyond end of specified region.
+                    cram_free_slice_header(shdr);
+                    // Clear the pointer so a later "goto err" cannot free
+                    // the slice header a second time.
+                    shdr = NULL;
+                    eor = 1;
+                    break;
+                }
+            }
+
+            if (embed_id < 0 && ref_id != -1) {
+                fprintf(stderr, "CRAM file has slice without embedded "
+                        "reference\n");
+                goto err;
+            }
+
+            if (ref_id != curr_ref_id) {
+                if (curr_ref_id >= 0) {
+                    if (dump_ref(h, iter, curr_ref_id, ref, ref_len,
+                                 outfp, verbose) < 0)
+                        goto err;
+                }
+
+                ref_len = sam_hdr_tid2len(h, ref_id);
+                if (ref_len) {
+                    char *ref2 = realloc(ref, ref_len);
+                    if (!ref2)
+                        goto err;
+                    ref = ref2;
+                    memset(ref, 'N', ref_len);
+                }
+                curr_ref_id = ref_id;
+            }
+
+            // Slice data blocks
+            for (j = 0; j < num_blocks; j++) {
+                // read and discard, unless it's the ref-ID block
+                if (!(blk = cram_read_block(in_c)))
+                    goto err;
+                if (cram_block_get_content_id(blk) == embed_id) {
+                    if (cram_uncompress_block(blk) != 0) {
+                        print_error("reference", "failed to uncompress "
+                                    "embedded reference block");
+                        goto err;
+                    }
+
+                    // ref_start is 1-based.  Copy only the part of the block
+                    // that lies inside ref[0..ref_len-1]; the arithmetic is
+                    // done in 64 bits so large coordinates do not truncate.
+                    int32_t usize = cram_block_get_uncomp_size(blk);
+                    if (usize > 0 && ref_start >= 1 &&
+                        (uint64_t) ref_start <= ref_len) {
+                        uint64_t avail = ref_len - (uint64_t)(ref_start - 1);
+                        size_t ncopy = (uint64_t) usize < avail
+                            ? (size_t) usize : (size_t) avail;
+                        memcpy(ref + ref_start-1, cram_block_get_data(blk),
+                               ncopy);
+                    }
+                }
+                cram_free_block(blk);
+                blk = NULL;
+            }
+            cram_free_slice_header(shdr);
+            shdr = NULL;
+        }
+
+        cram_free_container(c);
+        c = NULL;
+    }
+
+    int ret = 0;
+    if (curr_ref_id >= 0) {
+        ret = dump_ref(h, iter, curr_ref_id, ref, ref_len, outfp, verbose);
+    } else if (reg) {
+        // no data present, but we explicitly asked for the reference so
+        // report it still as Ns.
+        ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid));
+        // Ask for at least one byte: malloc(0) may legitimately return NULL.
+        ref = malloc(ref_len ? ref_len : 1);
+        if (!ref)
+            goto err;
+        memset(ref, 'N', ref_len);
+        ret = dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose);
+    }
+
+    free(ref);
+    if (iter)
+        hts_itr_destroy(iter);
+
+    return ret;
+
+ err:
+    free(ref);
+    if (blk)
+        cram_free_block(blk);
+    if (shdr)
+        cram_free_slice_header(shdr);
+    if (c)
+        cram_free_container(c);
+    if (iter)
+        hts_itr_destroy(iter);
+
+    return -1;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * MD method of reference construction
+ */
+
+// Steps one base along the CIGAR string and returns the op code (one of the
+// BAM_C* codes) covering that base, or -1 once the CIGAR is exhausted.
+//
+// cig_ind / cig_op / cig_len hold the walk state between calls: the index of
+// the next element to load, the current op and the bases left in it.
+// Any op with a non-zero entry in skip[] is consumed whole instead of being
+// returned; if it consumes query bases its length is added to *spos.
+static inline
+int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos,
+                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
+    for (;;) {
+        // Current element used up: load the next one, if there is one.
+        while (*cig_len == 0) {
+            if (*cig_ind >= *ncigar)
+                return -1;
+
+            uint32_t elem = cigar[*cig_ind];
+            *cig_op  = elem & BAM_CIGAR_MASK;
+            *cig_len = elem >> BAM_CIGAR_SHIFT;
+            (*cig_ind)++;
+        }
+
+        if (!skip[*cig_op]) {
+            // Hand out a single base of this op.
+            (*cig_len)--;
+            return *cig_op;
+        }
+
+        // Skipped op: swallow the whole element in one go.
+        *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
+        *cig_len = 0;
+    }
+}
+
+// As next_cigar_op(), but additionally consumes reference skips (CIGAR 'N').
+// These use up reference bases that MD:Z does not describe, so *rpos is
+// moved past the whole skipped region before the next op is returned.
+static inline
+int next_ref_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos,
+                      hts_pos_t *rpos, uint32_t *cig_ind, uint32_t *cig_op,
+                      uint32_t *cig_len) {
+    int op;
+    while ((op = next_cigar_op(cigar, ncigar, skip, spos,
+                               cig_ind, cig_op, cig_len)) == BAM_CREF_SKIP) {
+        // next_cigar_op() has already taken one base from the element.
+        *rpos += (hts_pos_t) *cig_len + 1;
+        *cig_len = 0;
+    }
+    return op;
+}
+
+// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference.
+// Updates ref[] array; positions at or beyond ref_len are not written.
+//
+// MD:Z is walked in step with the CIGAR string: matching runs copy bases
+// from SEQ, while deletions and substitutions take the reference base
+// from the MD string itself.
+//
+// Returns >0 on success,
+//          0 on no-MD found,
+//         -1 on failure (eg inconsistent data)
+static int build_ref(bam1_t *b, char *ref, size_t ref_len) {
+    uint8_t *seq = bam_get_seq(b);
+    uint32_t *cigar = bam_get_cigar(b);
+    int ncigar = b->core.n_cigar;
+    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
+
+    const uint8_t *MD = bam_aux_get(b, "MD");
+    if (!MD || *MD != 'Z')
+        return 0;
+    MD++;
+
+    // Walk through MD + seq to generate ref
+    int iseq = 0, next_op;
+    // 64-bit reference coordinate; an int would truncate large positions.
+    hts_pos_t iref = b->core.pos;
+    // Non-zero entries are consumed inside next_cigar_op(): I, S, H, P, B
+    // and the unused codes.  M, D, N, = and X are returned to the caller
+    // (N is then handled by next_ref_cigar_op()).
+    int cig_skip[16] = {0,1,0,0,1,1,1,0,0,1,1,1,1,1,1,1};
+    while (iseq < b->core.l_qseq && *MD) {
+        if (isdigit(*MD)) {
+            // match
+            long len = strtol((char *)MD, (char **)&MD, 10);
+            while (iseq < b->core.l_qseq && len) {
+                if ((next_op = next_ref_cigar_op(cigar, &ncigar, cig_skip,
+                                                 &iseq, &iref, &cig_ind,
+                                                 &cig_op, &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CMATCH &&
+                    next_op != BAM_CEQUAL) {
+                    print_error("MD2ref",
+                                "MD:Z and CIGAR are incompatible");
+                    return -1;
+                }
+
+                // Skipped insertions / soft clips may have advanced iseq;
+                // a CIGAR longer than SEQ must not read past the sequence.
+                if (iseq >= b->core.l_qseq) {
+                    print_error("MD2ref",
+                                "MD:Z and CIGAR are incompatible");
+                    return -1;
+                }
+
+                if (iref >= 0 && (uint64_t) iref < ref_len)
+                    ref[iref] = seq_nt16_str[bam_seqi(seq, iseq)];
+                iseq++;
+                iref++;
+                len--;
+            }
+        } else if (*MD == '^') {
+            // deletion
+            MD++;
+            while (*MD && isalpha(*MD)) {
+                if ((next_op = next_ref_cigar_op(cigar, &ncigar, cig_skip,
+                                                 &iseq, &iref, &cig_ind,
+                                                 &cig_op, &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CDEL) {
+                    print_error("MD2ref",
+                                "MD:Z and CIGAR are incompatible");
+                    return -1;
+                }
+
+                if (iref >= 0 && (uint64_t) iref < ref_len)
+                    ref[iref] = *MD;
+
+                MD++;
+                iref++;
+            }
+        } else {
+            // substitution
+            if ((next_op = next_ref_cigar_op(cigar, &ncigar, cig_skip,
+                                             &iseq, &iref, &cig_ind,
+                                             &cig_op, &cig_len)) < 0)
+                return -1;
+
+            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
+                print_error("MD2ref", "MD:Z and CIGAR are incompatible");
+                return -1;
+            }
+            if (iref >= 0 && (uint64_t) iref < ref_len)
+                ref[iref] = *MD;
+
+            MD++;
+            iref++;
+            iseq++;
+        }
+    }
+
+    return 1;
+}
+
+/*
+ * Reconstructs reference sequences from the alignment records in 'in',
+ * using each record's SEQ, CIGAR and MD:Z tag (see build_ref()).
+ *
+ * Records are read in file order, or through an index iterator when both
+ * 'idx' and 'reg' are given.  A buffer of 'N's is allocated per reference
+ * and written with dump_ref() each time the reference ID changes and once
+ * more at the end.  A requested region with no records is still reported,
+ * as all 'N'.
+ *
+ * Returns 0 on success,
+ *        -1 on failure.
+ */
+static int MD2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg,
+                  FILE *outfp, int verbose) {
+    bam1_t *b = bam_init1();
+    int r, last_tid = -99;
+    size_t ref_len = 0;
+    char *ref = NULL;
+    int ret = -1;
+    hts_itr_t *iter = NULL;
+
+    if (!b) {
+        print_error("reference", "failed to allocate memory");
+        goto err;
+    }
+
+    if (idx && reg) {
+        iter = sam_itr_querys(idx, h, reg);
+        if (!iter) {
+            print_error("reference", "failed to parse region '%s'", reg);
+            goto err;
+        }
+    }
+
+    while ((r = iter
+            ? sam_itr_next(in, iter, b)
+            : sam_read1(in, h, b)) >= 0) {
+        // check b->core.tid and flush old seq.
+        if (b->core.tid != last_tid) {
+            if (last_tid >= 0)
+                if (dump_ref(h, iter, last_tid, ref, ref_len, outfp,
+                             verbose) < 0)
+                    goto err;
+
+            last_tid = b->core.tid;
+            ref_len = sam_hdr_tid2len(h, last_tid);
+            if (ref_len) {
+                char *ref2 = realloc(ref, ref_len);
+                if (!ref2)
+                    goto err;
+                ref = ref2;
+                memset(ref, 'N', ref_len);
+            }
+        }
+
+        if (build_ref(b, ref, ref_len) < 0)
+            goto err;
+    }
+
+    if (last_tid >= 0) {
+        if (dump_ref(h, iter, last_tid, ref, ref_len, outfp, verbose) < 0)
+            goto err;
+    } else if (iter) {
+        // no data present, but we explicitly asked for the reference so
+        // report it still as Ns.  (Tested on iter rather than reg, as the
+        // iterator only exists when an index was supplied.)
+        ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid));
+        // Ask for at least one byte: malloc(0) may legitimately return NULL.
+        ref = malloc(ref_len ? ref_len : 1);
+        if (!ref)
+            goto err;
+        memset(ref, 'N', ref_len);
+        if (dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose) < 0)
+            goto err;
+    }
+
+    // -1 is a normal end of file; anything lower is a read error.
+    if (r < -1)
+        goto err;
+
+    ret = 0;
+
+ err:
+    if (iter)
+        hts_itr_destroy(iter);
+    bam_destroy1(b);
+    free(ref);
+    return ret;
+}
+
+/*
+ * Entry point for "samtools reference".
+ *
+ * Options:
+ *   -o FILE    write the FASTA output to FILE instead of stdout
+ *   -q         quiet; do not print per-reference summaries
+ *   -e         use the CRAM embedded reference (cram2ref) rather than
+ *              rebuilding it from MD:Z tags (MD2ref)
+ *   -r REGION  restrict output to REGION; requires an index
+ *   -@ N       global threading option
+ *
+ * Reads from the named file, or from stdin when none is given.
+ *
+ * Returns 0 on success (including a plain usage request) and 1 on any
+ * failure, including an unrecognised option.
+ */
+int main_reference(int argc, char *argv[])
+{
+    int c, usage = 0, verbose = 1, use_embedded = 0;
+    int ret = 1;
+    sam_hdr_t *h = 0;
+    samFile *in = NULL;
+    hts_idx_t *idx = NULL;
+    sam_global_args ga;
+    FILE *outfp = stdout;
+    char *reg = NULL;
+    char *fn = NULL;
+
+    static const struct option lopts[] = {
+        {"output", required_argument, NULL, 'o'},
+        {"quiet",  no_argument,       NULL, 'q'},
+        {"embedded", no_argument,     NULL, 'e'},
+        {"region", required_argument, NULL, 'r'},
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'),
+        { NULL, 0, NULL, 0 }
+    };
+
+    sam_global_args_init(&ga);
+
+    while ((c = getopt_long(argc, argv, "@:qo:er:", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'o':
+            // A repeated -o must not leak the previously opened file.
+            if (outfp != stdout)
+                fclose(outfp);
+            if (!(outfp = fopen(optarg, "w"))) {
+                print_error_errno("reference",
+                                  "failed to open '%s' for writing", optarg);
+                goto cleanup;
+            }
+            break;
+
+        case 'q':
+            verbose = 0;
+            break;
+
+        case 'e':
+            use_embedded = 1;
+            break;
+
+        case 'r':
+            reg = optarg;
+            break;
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+            /* else fall-through */
+        case '?': usage=1; break;
+        }
+    }
+
+    if ((optind == argc && isatty(0)) || usage) {
+        // A bad option is an error: report on stderr and fail.  Running
+        // with no input at a terminal is a request for help.
+        fprintf(usage ? stderr : stdout,
+                "Usage: samtools reference [-@ N] [-r region] [-e] [-q] [-o out.fa] [in.cram]\n");
+        ret = usage ? 1 : 0;
+        goto cleanup;
+    }
+
+    fn = optind < argc ? argv[optind] : "-";
+    if (!(in = sam_open(fn, "r"))) {
+        print_error_errno("reference", "failed to open file '%s'", fn);
+        goto cleanup;
+    }
+
+    if (ga.nthreads > 0)
+        hts_set_threads(in, ga.nthreads);
+
+    if (!(h = sam_hdr_read(in))) {
+        print_error("reference", "failed to read header from '%s'", fn);
+        goto cleanup;
+    }
+
+    if (reg) {
+        idx = sam_index_load(in, fn);
+        if (!idx) {
+            print_error_errno("reference", "Failed to load the index");
+            goto cleanup;
+        }
+    }
+
+    // Both workers return 0 or -1; map that onto an exit status.
+    if (use_embedded)
+        ret = cram2ref(in, h, idx, reg, outfp, verbose) < 0 ? 1 : 0;
+    else
+        ret = MD2ref(in, h, idx, reg, outfp, verbose) < 0 ? 1 : 0;
+
+ cleanup:
+    if (h)
+        sam_hdr_destroy(h);
+    if (idx)
+        hts_idx_destroy(idx);
+    if (in)
+        sam_close(in);
+    // outfp is NULL if opening the -o file failed.
+    if (outfp && outfp != stdout) {
+        // Buffered data is flushed here, so a failure means lost output.
+        if (fclose(outfp) != 0) {
+            print_error_errno("reference", "error closing output file");
+            ret = 1;
+        }
+    }
+
+    return ret;
+}
--- /dev/null
+#include "samtools.pysam.h"
+
+/* bam_reference.c -- extracts an embedded reference from a CRAM file,
+ or creates it from alignments plus MD:Z tags.
+
+ Copyright (C) 2022 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "htslib/sam.h"
+#include "htslib/cram.h"
+#include "samtools.h"
+#include "sam_opts.h"
+
+
+/*
+ * There are two main modes of operation.
+ *
+ * 1. Extracting the reference from the CRAM file embed_ref blocks.
+ * 2. Generation of reference by analysing consensus plus patches applied
+ * via MD tags.
+ *
+ * The first is very rapid, but only applies to CRAM files generated with
+ * specific options (not commonly used and not the default).  The second
+ * is a slow operation, but applies to any data type.
+ *
+ * This is also a testing ground for a future CRAM auto-embed-ref option that
+ * permits the use of an embedded reference without having to first extract
+ * the reference. (Note this may require the creation of MD tags during
+ * decode by use of an existing embedded reference, if the records don't
+ * have an MD tag themselves, but that's an issue for htslib when we get
+ * there.)
+ */
+
+/*
+ * ---------------------------------------------------------------------------
+ * Shared utility functions by both methods.
+ */
+
+#define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL)
+#define MIN(a,b) ((a)<(b)?(a):(b))
+/*
+ * Writes reference 'ref_id' to 'fp' as a FASTA record, 60 bases per line.
+ *
+ * When 'iter' is non-NULL and covers only part of the sequence, the header
+ * is ">name:beg-end" and only that sub-range of ref[] is written; otherwise
+ * the header is ">name" and all 'ref_len' bases are written.  An open-ended
+ * iterator (end >= HTS_POS_MAX) has its end set to 'ref_len' as a side
+ * effect.  A region starting at or beyond the end of the sequence yields a
+ * header with no bases.
+ *
+ * If 'verbose' is set, a summary line giving the percentage of non-'N'
+ * bases written is printed to samtools_stderr.
+ *
+ * Returns 0 on success,
+ *        -1 on write failure.
+ */
+static int dump_ref(sam_hdr_t *h, hts_itr_t *iter, int ref_id,
+                    char *ref, uint64_t ref_len, FILE *fp, int verbose) {
+    uint64_t N = 0;
+    if (iter && iter->end >= HTS_POS_MAX)
+        iter->end = ref_len;
+    if (iter && (iter->beg > 0 || iter->end < ref_len)) {
+        fprintf(fp, ">%s:%"PRIhts_pos"-%"PRIhts_pos"\n",
+                sam_hdr_tid2name(h, ref_id), iter->beg+1, iter->end);
+
+        // Clamp the region to the sequence so that a start position beyond
+        // the end cannot make the unsigned length wrap around.
+        uint64_t beg = iter->beg > 0 ? (uint64_t) iter->beg : 0;
+        uint64_t end = iter->end > 0 ? MIN(ref_len, (uint64_t) iter->end) : 0;
+        if (beg < end) {
+            ref += beg;
+            ref_len = end - beg;
+        } else {
+            ref_len = 0;
+        }
+    } else {
+        fprintf(fp, ">%s\n", sam_hdr_tid2name(h, ref_id));
+    }
+
+    // Count coverage, purely for information purposes.
+    // Words of 8 bases containing no 'N' are rejected with a single test;
+    // only the remaining words are examined byte by byte.
+    if (verbose) {
+        uint64_t j = 0, fast_end = ref_len & ~(uint64_t) 7;
+        for (; j < fast_end; j += 8) {
+            uint64_t i64;
+            // memcpy rather than a pointer cast: no alignment or aliasing
+            // requirements on ref[].
+            memcpy(&i64, &ref[j], sizeof(i64));
+            if (!haszero(i64 ^ 0x4e4e4e4e4e4e4e4eUL)) // 'N' <-> 0
+                continue;
+
+            int k;
+            for (k = 0; k < 8; k++)
+                N += ref[j+k] == 'N';
+        }
+        for (; j < ref_len; j++)
+            N += ref[j] == 'N';
+    }
+
+    // Format reference
+    uint64_t rem = ref_len;
+    while (rem > 0) {
+        size_t len = rem < 60 ? (size_t) rem : 60;
+        if (fwrite(ref, 1, len, fp) != len)
+            return -1;
+        if (putc('\n', fp) == EOF)
+            return -1;
+        ref += len;
+        rem -= len;
+    }
+
+    if (verbose)
+        fprintf(samtools_stderr, "Dump ref %d len %"PRIu64", coverage %.2f%%\n",
+                ref_id, ref_len,
+                ref_len ? 100 - N*100.0 / ref_len : 0.0);
+
+    return 0;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * CRAM embedded reference method of reference construction
+ */
+
+/*
+ * Extracts an embedded reference from a sorted CRAM file.
+ * Modelled on the CRAM container copy loop from bam_cat.c.
+ *
+ * Containers are read one at a time; for each slice the block holding the
+ * embedded reference is uncompressed and copied into a per-chromosome
+ * buffer pre-filled with 'N'.  The buffer is written with dump_ref() each
+ * time the reference ID changes and once more at the end.  If 'reg' is
+ * given, reading stops at the first slice outside that region, and a region
+ * with no data is still reported, as all 'N'.
+ *
+ * Returns 0 on success,
+ *        -1 on failure.
+ */
+static int cram2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg,
+                    FILE *outfp, int verbose) {
+    cram_fd *in_c;
+    cram_container *c = NULL;
+    cram_block *blk = NULL;
+    cram_block_slice_hdr *shdr = NULL;
+
+    int curr_ref_id = -99;
+    char *ref = NULL;
+    uint64_t ref_len = 0;
+
+    // We have no direct public API for seeking in CRAM to a specific
+    // location by genome coordinates.  The sam_itr_query API is
+    // designed for fetching records, rather than seeks to specific
+    // file locations.
+    //
+    // TODO: consider exposing cram_range and cram_seek_to_refpos API.
+    // After a sam_index_load which will add the index to infp, these
+    // functions should seek direct to the start of a container.
+    // Or use cram_index *e =cram_index_query(cram, tid, beg, NULL);
+    //
+    // However, fortuitously(?) sam_itr_querys calls cram_seek_to_refpos
+    // so we can do a region query and let that do the initial seek.
+    // We still need to do our own end-range detection though.
+
+    hts_itr_t *iter = NULL;
+    if (reg) {
+        iter = sam_itr_querys(idx, h, reg);
+        if (!iter) {
+            print_error("reference", "failed to parse region '%s'", reg);
+            goto err;
+        }
+    }
+
+    in_c = in->fp.cram; // low level htslib abuse?
+    int eor = 0;
+    while (!eor && (c = cram_read_container(in_c))) {
+        if (cram_container_is_empty(in_c)) {
+            // Container compression header
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+            cram_free_container(c);
+            c = NULL;
+            continue;
+        }
+
+        // Container compression header; read and discard
+        int32_t num_slices;
+        if (!(blk = cram_read_block(in_c)))
+            goto err;
+        cram_free_block(blk);
+        blk = NULL;
+
+        // Container num_blocks can be invalid, due to a bug.
+        // Instead we iterate in slice context instead.
+        (void)cram_container_get_landmarks(c, &num_slices);
+        int i, j;
+        for (i = 0; i < num_slices; i++) {
+            // Slice header
+            if (!(blk = cram_read_block(in_c)))
+                goto err;
+            if (!(shdr = cram_decode_slice_header(in_c, blk)))
+                goto err;
+            cram_free_block(blk);
+            blk = NULL;
+
+            int num_blocks = cram_slice_hdr_get_num_blocks(shdr);
+            int embed_id = cram_slice_hdr_get_embed_ref_id(shdr);
+            int ref_id;
+            hts_pos_t ref_start, ref_span;
+            cram_slice_hdr_get_coords(shdr, &ref_id, &ref_start, &ref_span);
+
+            if (iter) {
+                if (iter->tid != ref_id || ref_start > iter->end) {
+                    // Beyond end of specified region.
+                    cram_free_slice_header(shdr);
+                    // Clear the pointer so a later "goto err" cannot free
+                    // the slice header a second time.
+                    shdr = NULL;
+                    eor = 1;
+                    break;
+                }
+            }
+
+            if (embed_id < 0 && ref_id != -1) {
+                fprintf(samtools_stderr, "CRAM file has slice without embedded "
+                        "reference\n");
+                goto err;
+            }
+
+            if (ref_id != curr_ref_id) {
+                if (curr_ref_id >= 0) {
+                    if (dump_ref(h, iter, curr_ref_id, ref, ref_len,
+                                 outfp, verbose) < 0)
+                        goto err;
+                }
+
+                ref_len = sam_hdr_tid2len(h, ref_id);
+                if (ref_len) {
+                    char *ref2 = realloc(ref, ref_len);
+                    if (!ref2)
+                        goto err;
+                    ref = ref2;
+                    memset(ref, 'N', ref_len);
+                }
+                curr_ref_id = ref_id;
+            }
+
+            // Slice data blocks
+            for (j = 0; j < num_blocks; j++) {
+                // read and discard, unless it's the ref-ID block
+                if (!(blk = cram_read_block(in_c)))
+                    goto err;
+                if (cram_block_get_content_id(blk) == embed_id) {
+                    if (cram_uncompress_block(blk) != 0) {
+                        print_error("reference", "failed to uncompress "
+                                    "embedded reference block");
+                        goto err;
+                    }
+
+                    // ref_start is 1-based.  Copy only the part of the block
+                    // that lies inside ref[0..ref_len-1]; the arithmetic is
+                    // done in 64 bits so large coordinates do not truncate.
+                    int32_t usize = cram_block_get_uncomp_size(blk);
+                    if (usize > 0 && ref_start >= 1 &&
+                        (uint64_t) ref_start <= ref_len) {
+                        uint64_t avail = ref_len - (uint64_t)(ref_start - 1);
+                        size_t ncopy = (uint64_t) usize < avail
+                            ? (size_t) usize : (size_t) avail;
+                        memcpy(ref + ref_start-1, cram_block_get_data(blk),
+                               ncopy);
+                    }
+                }
+                cram_free_block(blk);
+                blk = NULL;
+            }
+            cram_free_slice_header(shdr);
+            shdr = NULL;
+        }
+
+        cram_free_container(c);
+        c = NULL;
+    }
+
+    int ret = 0;
+    if (curr_ref_id >= 0) {
+        ret = dump_ref(h, iter, curr_ref_id, ref, ref_len, outfp, verbose);
+    } else if (reg) {
+        // no data present, but we explicitly asked for the reference so
+        // report it still as Ns.
+        ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid));
+        // Ask for at least one byte: malloc(0) may legitimately return NULL.
+        ref = malloc(ref_len ? ref_len : 1);
+        if (!ref)
+            goto err;
+        memset(ref, 'N', ref_len);
+        ret = dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose);
+    }
+
+    free(ref);
+    if (iter)
+        hts_itr_destroy(iter);
+
+    return ret;
+
+ err:
+    free(ref);
+    if (blk)
+        cram_free_block(blk);
+    if (shdr)
+        cram_free_slice_header(shdr);
+    if (c)
+        cram_free_container(c);
+    if (iter)
+        hts_itr_destroy(iter);
+
+    return -1;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * MD method of reference construction
+ */
+
+// Steps through a packed CIGAR array one base at a time.
+//
+//   cigar, ncigar   packed CIGAR operations and how many there are
+//   skip            table indexed by op code; ops with a non-zero entry
+//                   are swallowed whole instead of being returned
+//   spos            sequence position, moved on across swallowed ops
+//                   for which bit 0 of bam_cigar_type() is set
+//   cig_ind         index of the next array element to load
+//   cig_op, cig_len op code currently being expanded and the number of
+//                   bases still left in it
+//
+// Returns the BAM_C* op code covering the next non-skipped base,
+// or -1 once the CIGAR has been used up.
+static inline
+int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos,
+                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
+    while (1) {
+        // Current op exhausted: load elements until one has a length.
+        while (*cig_len == 0) {
+            if (!(*cig_ind < *ncigar))
+                return -1;
+
+            uint32_t packed = cigar[*cig_ind];
+            *cig_op  = packed & BAM_CIGAR_MASK;
+            *cig_len = packed >> BAM_CIGAR_SHIFT;
+            ++*cig_ind;
+        }
+
+        if (!skip[*cig_op]) {
+            // Hand one base of this op back to the caller.
+            --*cig_len;
+            return *cig_op;
+        }
+
+        // Skipped op: consume all of it in one go.
+        *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
+        *cig_len = 0;
+    }
+}
+
+// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference.
+// Updates ref[] array.  Reference positions outside [0, ref_len) are
+// still walked, but nothing is written for them.
+//
+// Returns >0 on success,
+//          0 on no-MD found,
+//         -1 on failure (eg inconsistent data)
+static int build_ref(bam1_t *b, char *ref, size_t ref_len) {
+    uint8_t *seq = bam_get_seq(b);
+    uint32_t *cigar = bam_get_cigar(b);
+    int ncigar = b->core.n_cigar;
+    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
+
+    const uint8_t *MD = bam_aux_get(b, "MD");
+    if (!MD || *MD != 'Z')
+        return 0;
+    MD++; // step over the 'Z' type byte to the tag text
+
+    // Walk through MD + seq to generate ref.
+    //
+    // iref holds the full 64-bit position: narrowing core.pos to int
+    // would wrap on references longer than INT_MAX and silently leave
+    // those regions as Ns.
+    int iseq = 0, next_op;
+    hts_pos_t iref = b->core.pos;
+
+    // Indexed by CIGAR op code.  Only M, D, = and X (codes 0, 2, 7, 8)
+    // are handed back by next_cigar_op(); everything else is skipped.
+    // NOTE(review): N (ref-skip) is skipped without advancing iref, so
+    // spliced alignments look unsupported here - confirm this is intended.
+    int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
+
+    while (iseq < b->core.l_qseq && *MD) {
+        if (isdigit(*MD)) {
+            // match: copy "len" bases from the read into the reference
+            int len = strtol((char *)MD, (char **)&MD, 10);
+            while (iseq < b->core.l_qseq && len) {
+                if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
+                                             &iseq, &cig_ind, &cig_op,
+                                             &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CMATCH &&
+                    next_op != BAM_CEQUAL) {
+                    print_error("MD2ref",
+                                "MD:Z and CIGAR are incompatible");
+                    return -1;
+                }
+
+                if (iref >= 0 && (uint64_t)iref < ref_len)
+                    ref[iref] = seq_nt16_str[bam_seqi(seq, iseq)];
+                iseq++;
+                iref++;
+                len--;
+            }
+        } else if (*MD == '^') {
+            // deletion: the reference bases are spelt out in MD itself
+            MD++;
+            while (*MD && isalpha(*MD)) {
+                if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
+                                             &iseq, &cig_ind, &cig_op,
+                                             &cig_len)) < 0)
+                    return -1;
+
+                if (next_op != BAM_CDEL) {
+                    print_error("MD2ref",
+                                "MD:Z and CIGAR are incompatible");
+                    return -1;
+                }
+
+                if (iref >= 0 && (uint64_t)iref < ref_len)
+                    ref[iref] = *MD;
+
+                MD++;
+                iref++;
+            }
+        } else {
+            // substitution: MD gives the reference base at this position
+            if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip,
+                                         &iseq, &cig_ind, &cig_op,
+                                         &cig_len)) < 0)
+                return -1;
+
+            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
+                print_error("MD2ref", "MD:Z and CIGAR are incompatible");
+                return -1;
+            }
+            if (iref >= 0 && (uint64_t)iref < ref_len)
+                ref[iref] = *MD;
+
+            MD++;
+            iref++;
+            iseq++;
+        }
+    }
+
+    return 1;
+}
+
+// MD:Z driver for reference construction.
+//
+// Reads every record from "in" (or, when both idx and reg are given, only
+// those returned by the region iterator), rebuilding the reference from
+// SEQ, CIGAR and MD:Z with build_ref().  The buffer is written out with
+// dump_ref() each time the tid changes and once more at end of input.
+// If a region was queried but produced no data, it is still reported,
+// filled with Ns.
+//
+// Returns 0 on success,
+//        -1 on failure.
+static int MD2ref(samFile *in, sam_hdr_t *h, hts_idx_t *idx, char *reg,
+                  FILE *outfp, int verbose) {
+    bam1_t *b = bam_init1();
+    int r, last_tid = -99;
+    size_t ref_len = 0;
+    char *ref = NULL;
+    int ret = -1;
+    hts_itr_t *iter = NULL;
+
+    if (!b) {
+        print_error_errno("reference", "could not allocate bam record");
+        return -1;
+    }
+
+    if (idx && reg) {
+        iter = sam_itr_querys(idx, h, reg);
+        if (!iter) {
+            print_error("reference", "failed to parse region '%s'", reg);
+            goto err;
+        }
+    }
+
+    while ((r = iter
+            ? sam_itr_next(in, iter, b)
+            : sam_read1(in, h, b)) >= 0) {
+        // check b->core.tid and flush old seq.
+        if (b->core.tid != last_tid) {
+            if (last_tid >= 0)
+                if (dump_ref(h, iter, last_tid, ref, ref_len, outfp,
+                             verbose) < 0)
+                    goto err;
+
+            last_tid = b->core.tid;
+            ref_len = sam_hdr_tid2len(h, last_tid);
+            if (ref_len) {
+                // Keep the old pointer until realloc is known to have
+                // succeeded so the err path can still free it.
+                char *ref2 = realloc(ref, ref_len);
+                if (!ref2)
+                    goto err;
+                else
+                    ref = ref2;
+                memset(ref, 'N', ref_len);
+            }
+        }
+
+        if (build_ref(b, ref, ref_len) < 0)
+            goto err;
+    }
+
+    if (last_tid >= 0) {
+        if (dump_ref(h, iter, last_tid, ref, ref_len, outfp, verbose) < 0)
+            goto err;
+    } else if (iter) {
+        // no data present, but we explicitly asked for the reference so
+        // report it still as Ns.  Guarded on iter, not reg: iter is what
+        // gets dereferenced and it only exists when an index was given.
+        ref_len = MIN(iter->end, sam_hdr_tid2len(h, iter->tid));
+        ref = malloc(ref_len);
+        if (!ref)               // must be tested before the memset
+            goto err;
+        memset(ref, 'N', ref_len);
+        if (dump_ref(h, iter, iter->tid, ref, ref_len, outfp, verbose) < 0)
+            goto err;
+    }
+
+    // -1 is normal end of input; anything lower is a read error.
+    if (r < -1)
+        goto err;
+
+    ret = 0;
+
+ err:
+    if (iter)
+        hts_itr_destroy(iter);
+    bam_destroy1(b);
+    free(ref);
+    return ret;
+}
+
+// Entry point for "samtools reference".
+//
+// Options: -o FILE  write to FILE instead of samtools_stdout
+//          -q       sets verbose to 0
+//          -e       use cram2ref() (embedded reference) instead of MD2ref()
+//          -r REG   restrict to a region; the index is loaded for this
+//          -@ N     global thread option
+//
+// Returns 0 on success or after printing the usage text, non-zero on
+// failure.  Every exit path goes through the "done" label so the output
+// file, index, header and input handle are always released.
+int main_reference(int argc, char *argv[])
+{
+    int c, usage = 0, verbose = 1, use_embedded = 0;
+    int ret = 1;
+    sam_hdr_t *h = 0;
+    samFile *in = NULL;
+    hts_idx_t *idx = NULL;
+    sam_global_args ga;
+    FILE *outfp = samtools_stdout;
+    char *reg = NULL;
+    char *fn = NULL;
+
+    static const struct option lopts[] = {
+        {"output",   required_argument, NULL, 'o'},
+        {"quiet",    no_argument,       NULL, 'q'},
+        {"embedded", no_argument,       NULL, 'e'},
+        {"region",   required_argument, NULL, 'r'},
+        SAM_OPT_GLOBAL_OPTIONS('-', '-', '-', '-', '-', '@'),
+        { NULL, 0, NULL, 0 }
+    };
+
+    sam_global_args_init(&ga);
+
+    while ((c = getopt_long(argc, argv, "@:qo:er:", lopts, NULL)) >= 0) {
+        switch (c) {
+        case 'o':
+            // A repeated -o must not leak the file opened earlier.
+            if (outfp && outfp != samtools_stdout)
+                fclose(outfp);
+            if (!(outfp = fopen(optarg, "w"))) {
+                perror(optarg);
+                goto done;
+            }
+            break;
+
+        case 'q':
+            verbose = 0;
+            break;
+
+        case 'e':
+            use_embedded = 1;
+            break;
+
+        case 'r':
+            reg = optarg;
+            break;
+
+        default:
+            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+            /* else fall-through */
+        case '?': usage=1; break;
+        }
+    }
+
+    if ((optind == argc && isatty(0)) || usage) {
+        fprintf(samtools_stdout, "Usage: samtools reference [-@ N] [-r region] [-e] [-q] [-o out.fa] [in.cram]\n");
+        ret = 0;
+        goto done;
+    }
+
+    fn = optind < argc ? argv[optind] : "-";
+    if (!(in = sam_open(fn, "r"))) {
+        print_error_errno("reference", "failed to open file '%s'", fn);
+        goto done;
+    }
+
+    if (ga.nthreads > 0)
+        hts_set_threads(in, ga.nthreads);
+
+    if (!(h = sam_hdr_read(in))) {
+        print_error("reference", "failed to read header from '%s'", fn);
+        goto done;
+    }
+
+    if (reg) {
+        idx = sam_index_load(in, fn);
+        if (!idx) {
+            print_error_errno("reference", "Failed to load the index");
+            goto done;
+        }
+    }
+
+    ret = use_embedded
+        ? cram2ref(in, h, idx, reg, outfp, verbose)
+        : MD2ref(in, h, idx, reg, outfp, verbose);
+
+ done:
+    if (h)
+        sam_hdr_destroy(h);
+    if (idx)
+        hts_idx_destroy(idx);
+    if (in)
+        sam_close(in);
+    // outfp is NULL only when fopen() for -o failed.
+    if (outfp && outfp != samtools_stdout) {
+        // fclose flushes; a failure here means the output is incomplete.
+        if (fclose(outfp) != 0 && ret == 0) {
+            print_error_errno("reference", "failed to close output file");
+            ret = 1;
+        }
+    }
+
+    return ret;
+}
conf->count++;
} else if (conf->unmap) {
b->core.flag |= BAM_FUNMAP;
+ b->core.qual = 0;
+ b->core.isize = 0;
+
+ // remove CIGAR
+ if (b->core.n_cigar) {
+ memmove(bam_get_cigar(b), bam_get_seq(b),
+ b->data + b->l_data - bam_get_seq(b));
+ b->l_data -= 4*b->core.n_cigar;
+ b->core.n_cigar = 0;
+ }
+
if (check_sam_write1(conf->out, conf->header,
b, conf->fn_out, write_error) < 0) {
return -1;
print_error_errno("view", "could not allocate bam record");
return 1;
}
+ errno = 0; // prevent false error messages.
while ((r = sam_read1(conf->in, conf->header, b)) >= 0) {
if (process_one_record(conf, b, &write_error) < 0) break;
}
bam_destroy1(b);
if (result < -1) {
- print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid);
+ print_error("view", "retrieval of region #%d failed", iter->curr_tid);
return 1;
}
return write_error;
conf->count++;
} else if (conf->unmap) {
b->core.flag |= BAM_FUNMAP;
+ b->core.qual = 0;
+ b->core.isize = 0;
+
+ // remove CIGAR
+ if (b->core.n_cigar) {
+ memmove(bam_get_cigar(b), bam_get_seq(b),
+ b->data + b->l_data - bam_get_seq(b));
+ b->l_data -= 4*b->core.n_cigar;
+ b->core.n_cigar = 0;
+ }
+
if (check_sam_write1(conf->out, conf->header,
b, conf->fn_out, write_error) < 0) {
return -1;
print_error_errno("view", "could not allocate bam record");
return 1;
}
+ errno = 0; // prevent false error messages.
while ((r = sam_read1(conf->in, conf->header, b)) >= 0) {
if (process_one_record(conf, b, &write_error) < 0) break;
}
bam_destroy1(b);
if (result < -1) {
- print_error("view", "retrieval of region %d failed due to truncated file or corrupt BAM index file", iter->curr_tid);
+ print_error("view", "retrieval of region #%d failed", iter->curr_tid);
return 1;
}
return write_error;
# DEALINGS IN THE SOFTWARE.
# Master version, for use in tarballs or non-git source copies
-VERSION=1.15.1
+VERSION=1.16.1
# If we have a git clone, then check against the current tag
if [ -e .git ]
return False
+def run_make(targets):  # Run $MAKE (falling back to "make") with the list `targets` appended as its arguments.
+ sys.stdout.flush()  # push out our own buffered output first; presumably so it stays ordered ahead of the child's output
+ subprocess.check_call([os.environ.get("MAKE", "make")] + targets)  # check_call raises CalledProcessError on a non-zero exit status
+
+
def run_make_print_config():
stdout = subprocess.check_output(["make", "-s", "print-config"])
if IS_PYTHON3:
symbols = set()
for line in stdout.splitlines():
(sym, symtype) = line.split()[:2]
- if symtype not in "UFWw":
+ if symtype not in "UFNWw":
if IS_DARWIN:
# On macOS, all symbols have a leading underscore
symbols.add(sym.lstrip('_'))
else:
# Ignore symbols such as _edata (present in all shared objects)
- if not sym.startswith('_'): symbols.add(sym)
+ if sym[0] not in "_$.@": symbols.add(sym)
return symbols
optionise('-D', kvtuples(ext.define_macros)) +
optionise('-U', ext.undef_macros))
- cflags = " ".join(sc('CFLAGS') + env('CFLAGS') + ext.extra_compile_args)
+ cflags = " ".join(sc('CFLAGS') + env('CFLAGS') + sc('CCSHARED') +
+ ext.extra_compile_args)
# distutils actually includes $CPPFLAGS here too, but that's weird and
# unnecessary for us as we know the output LDFLAGS will be used correctly
print("# pysam: (env) {}={}".format(var, os.environ[var]))
elif var in sysconfig.get_config_vars():
value = sysconfig.get_config_var(var)
+ if var == 'CFLAGS' and 'CCSHARED' in sysconfig.get_config_vars():
+ value += ' ' + sysconfig.get_config_var('CCSHARED')
print("# pysam: (sysconfig) {}={}".format(var, value))
os.environ[var] = value
tmp_vars += [var]
return None
-def distutils_dir_name(dname):
- """Returns the name of a distutils build directory
- see: http://stackoverflow.com/questions/14320220/
- testing-python-c-libraries-get-build-path
- """
- f = "{dirname}.{platform}-{version[0]}.{version[1]}"
- return f.format(dirname=dname,
- platform=sysconfig.get_platform(),
- version=sys.version_info)
-
-
def get_pysam_version():
sys.path.insert(0, "pysam")
import version
def run(self):
build.run(self)
try:
- self.check_ext_symbol_conflicts()
+ if HTSLIB_MODE != 'separate':
+ self.check_ext_symbol_conflicts()
except OSError as e:
log.warn("skipping symbol collision check (invoking nm failed: %s)", e)
except subprocess.CalledProcessError:
for header in headers:
os.remove(header)
+ objects = (glob.glob(os.path.join("htslib", "*.[oa]")) +
+ glob.glob(os.path.join("htslib", "cram", "*.o")) +
+ glob.glob(os.path.join("htslib", "htscodecs", "htscodecs", "*.o")))
+ if objects:
+ log.info("removing 'htslib/**/*.o' and libhts.a (%s objects)", len(objects))
+ for obj in objects:
+ os.remove(obj)
+
# How to link against HTSLIB
# shared: build shared chtslib from builtin htslib code.
external_htslib_libraries.extend(
[re.sub("^-l", "", x) for x in htslib_make_options["LIBS"].split(" ") if x.strip()])
- shared_htslib_sources = [re.sub("\.o", ".c", os.path.join("htslib", x))
- for x in
- htslib_make_options["LIBHTS_OBJS"].split(" ")]
-
- htslib_sources = []
-
if HTSLIB_LIBRARY_DIR:
- # linking against a shared, externally installed htslib version, no
- # sources required for htslib
- htslib_sources = []
- shared_htslib_sources = []
+ # linking against a shared, externally installed htslib version,
+ # no sources or built libhts.a required for htslib
+ htslib_objects = []
+ separate_htslib_objects = []
chtslib_sources = []
htslib_library_dirs = [HTSLIB_LIBRARY_DIR]
htslib_include_dirs = [HTSLIB_INCLUDE_DIR]
elif HTSLIB_MODE == 'separate':
# add to each pysam component a separately compiled
# htslib
- htslib_sources = shared_htslib_sources
- shared_htslib_sources = htslib_sources
+ htslib_objects = ['htslib/libhts.a']
+ separate_htslib_objects = ['htslib/libhts.a']
htslib_library_dirs = []
htslib_include_dirs = ['htslib']
elif HTSLIB_MODE == 'shared':
# link each pysam component against the same
# htslib built from sources included in the pysam
# package.
- htslib_library_dirs = [
- "pysam", # when using setup.py develop?
- ".", # when using setup.py develop?
- os.path.join("build", distutils_dir_name("lib"), "pysam")]
+ # Link with the object files rather than the final htslib/libhts.a, to ensure that
+ # all object files are pulled into the link, even those not used by htslib itself.
+ htslib_objects = [os.path.join("htslib", x)
+ for x in htslib_make_options["LIBHTS_OBJS"].split(" ")]
+ separate_htslib_objects = []
+
+ htslib_library_dirs = ["."] # when using setup.py develop?
htslib_include_dirs = ['htslib']
else:
raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
def prebuild_libchtslib(ext, force):  # Pre-build step for pysam.libchtslib: writes the config header and builds htslib/libhts.a if needed.
if HTSLIB_MODE not in ['shared', 'separate']: return  # any other mode: nothing to do here
+
write_configvars_header("htslib/config_vars.h", ext, "HTS")
+ if force or not os.path.exists("htslib/libhts.a"):  # rebuild when forced or when the static library is absent
+ log.info("building 'libhts.a'")
+ with changedir("htslib"):
+ # TODO Eventually by running configure here, we can set these
+ # extra flags for configure instead of hacking on ALL_CPPFLAGS.
+ args = " ".join(ext.extra_compile_args)
+ run_make(["ALL_CPPFLAGS=-I. " + args + " $(CPPFLAGS)", "lib-static"])  # passes the extension's extra compile args to make via ALL_CPPFLAGS, target lib-static
+ else:
+ log.warn("skipping 'libhts.a' (already built)")
+
+
def prebuild_libcsamtools(ext, force):
write_configvars_header("samtools/samtools_config_vars.h", ext, "SAMTOOLS")
+
modules = [
dict(name="pysam.libchtslib",
prebuild_func=prebuild_libchtslib,
- sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + shared_htslib_sources + os_c_files,
+ sources=[source_pattern % "htslib", "pysam/htslib_util.c"] + os_c_files,
+ extra_objects=htslib_objects,
libraries=external_htslib_libraries),
dict(name="pysam.libcsamtools",
prebuild_func=prebuild_libcsamtools,
sources=[source_pattern % "samtools"] + glob.glob(os.path.join("samtools", "*.pysam.c")) +
- [os.path.join("samtools", "lz4", "lz4.c")] + htslib_sources + os_c_files,
+ [os.path.join("samtools", "lz4", "lz4.c")] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=external_htslib_libraries + internal_htslib_libraries),
dict(name="pysam.libcbcftools",
- sources=[source_pattern % "bcftools"] + glob.glob(os.path.join("bcftools", "*.pysam.c")) + htslib_sources + os_c_files,
+ sources=[source_pattern % "bcftools"] + glob.glob(os.path.join("bcftools", "*.pysam.c")) + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=external_htslib_libraries + internal_htslib_libraries),
dict(name="pysam.libcutils",
- sources=[source_pattern % "utils", "pysam/pysam_util.c"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "utils", "pysam/pysam_util.c"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=external_htslib_libraries + internal_htslib_libraries + internal_samtools_libraries),
dict(name="pysam.libcalignmentfile",
- sources=[source_pattern % "alignmentfile"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "alignmentfile"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libcsamfile",
- sources=[source_pattern % "samfile"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "samfile"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libcalignedsegment",
- sources=[source_pattern % "alignedsegment"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "alignedsegment"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libctabix",
- sources=[source_pattern % "tabix"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "tabix"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libcfaidx",
- sources=[source_pattern % "faidx"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "faidx"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libcbcf",
- sources=[source_pattern % "bcf"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "bcf"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libcbgzf",
- sources=[source_pattern % "bgzf"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "bgzf"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libctabixproxies",
- sources=[source_pattern % "tabixproxies"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "tabixproxies"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
dict(name="pysam.libcvcf",
- sources=[source_pattern % "vcf"] + htslib_sources + os_c_files,
+ sources=[source_pattern % "vcf"] + os_c_files,
+ extra_objects=separate_htslib_objects,
libraries=libraries_for_pysam_module),
]
define_macros=define_macros,
# for out-of-tree compilation, use absolute paths
library_dirs=[os.path.abspath(x) for x in ["pysam"] + htslib_library_dirs],
- include_dirs=[os.path.abspath(x) for x in htslib_include_dirs + \
- ["samtools", "samtools/lz4", "bcftools", "pysam", "."] + include_os])
+ include_dirs=[os.path.abspath(x) for x in ["pysam"] + htslib_include_dirs + \
+ ["samtools", "samtools/lz4", "bcftools", "."] + include_os])
# add common options (in python >3.5, could use n = {**a, **b}
for module in modules:
'''
import unittest
+import pytest
import os
import shutil
import sys
pysam.AlignmentFile,
os.path.join(BAM_DATADIR, 'ex2_truncated.bam'))
+ @pytest.mark.filterwarnings('ignore:no BGZF EOF marker')
def testTruncatedBamIterator(self):
s = pysam.AlignmentFile(os.path.join(BAM_DATADIR, 'ex2_truncated.bam'),
ignore_truncation=True)
assert str(record)[:-1].split("\t")[-2:] == [
"anno1",
"Friedrich-Alexander-Universit\u00E4t_Erlangen-N\u00FCrnberg"]
+
+def test_set_sample_alleles(vcf_header):  # Checks assigning a sample genotype through the .alleles property (allele strings, empty values, error cases).
+ vcf_header.formats.add('GT',1,'String',"Genotype") # id, number, type, description
+ record = vcf_header.new_record(
+ contig="1",
+ start=20,
+ stop=21,
+ alleles=('A','T')
+ )
+
+ record.samples['sample1'].alleles = ('T', 'A')  # both strings are alleles of the record, given in the opposite order
+ assert record.samples['sample1'].alleles == ('T','A')
+
+ # Empty genotype: (None,), None and an empty tuple must all read back as an empty tuple.
+ record.samples['sample1'].alleles = (None, )
+ assert record.samples['sample1'].alleles == tuple()
+ record.samples['sample1'].alleles = None
+ assert record.samples['sample1'].alleles == tuple()
+ record.samples['sample1'].alleles = tuple()
+ assert record.samples['sample1'].alleles == tuple()
+
+ # check error conditions:
+ with pytest.raises(ValueError, match='One or more of the supplied sample alleles are not defined'):
+ record.samples['sample1'].alleles = ('C', 'A')  # 'C' is not among the record's alleles ('A','T')
+
+ with pytest.raises(ValueError, match='Use .allele_indices to set integer allele indices'):
+ record.samples['sample1'].alleles = (1, 0)  # integers are rejected by .alleles
samtools mpileup -f ex1.fa ex1.bam | gzip > ex1.pileup.gz
ex2_truncated.bam: ex2.bam
- head -c 124000 ex2.bam > ex2_truncated.bam
+ dd if=ex2.bam of=ex2_truncated.bam bs=$$((`wc -c < ex2.bam`-512)) count=1
# Append a corrupt read with block_size < sizeof(bam_core_t fields)
ex2_corrupt.bam: ex2.bam
local = os.path.join(TABIX_DATADIR, "example.gtf.gz")
def setUp(self):
- if not pysam.config.HAVE_LIBCURL or not check_url(self.url):
+ if not getattr(pysam.config, "HAVE_LIBCURL", 0) or not check_url(self.url):
self.remote_file = None
else:
self.remote_file = pysam.TabixFile(self.url, "r")
local = os.path.join(TABIX_DATADIR, "example_comments.gtf.gz")
def setUp(self):
- if not pysam.config.HAVE_LIBCURL or not check_url(self.url):
+ if not getattr(pysam.config, "HAVE_LIBCURL", 0) or not check_url(self.url):
self.remote_file = None
else:
self.remote_file = pysam.TabixFile(self.url, "r")