pysam/config.py
# cython files
-pysam/TabProxies.c
-pysam/csamtools.c
-pysam/ctabix.c
-pysam/cvcf.c
-pysam/chtslib.c
-pysam/cutils.c
-pysam/calignedsegment.c
-pysam/calignmentfile.c
-pysam/cbcf.c
-pysam/cfaidx.c
-pysam/chtslib.c
-pysam/csamfile.c
-pysam/ctabixproxies.c
+pysam/libc*.c
###### Generic python ignores below ######
include THANKS
include cy_build.py
include requirements.txt
-include pysam/c*.pxd
-include pysam/c*.pyx
-include pysam/c*.c
+include pysam/libc*.pxd
+include pysam/libc*.pyx
+include pysam/libc*.c
include pysam/*.c
include pysam/*.h
include samtools/configure
include htslib/configure
include htslib/config.mk.in
include htslib/config.h.in
+include htslib/htslib.pc.in
include htslib/htslib/*.h
include htslib/cram/*.c
include htslib/cram/*.h
#include <htslib/hts.h>
#include "HMM.h"
+typedef struct
+{
+ int nstates; // number of hmm's states
+ int isite; // take snapshot at i-th position
+ uint32_t pos; // i-th site's position
+ double *vit_prob; // viterbi probabilities, NULL for uniform probs
+ double *fwd_prob; // transition probabilities
+ double *bwd_prob; // transition probabilities
+}
+snapshot_t;
+
struct _hmm_t
{
int nstates; // number of states
set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities
// at each site (one step of Viterbi algorithm)
void *set_tprob_data;
- double *init_probs; // Initial state probabilities, NULL for uniform probs
+ snapshot_t init; // Initial state probabilities. Set isite=1 when site should be used
+ snapshot_t *snapshot;
};
uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
memcpy(dst,out,sizeof(double)*n*n);
}
+void hmm_init_states(hmm_t *hmm, double *probs)
+{
+ hmm->init.isite = 0;
+ hmm->init.pos = 0;
+ if ( !hmm->init.vit_prob )
+ hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.fwd_prob )
+ hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.bwd_prob )
+ hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+
+ int i;
+ if ( probs )
+ {
+ memcpy(hmm->init.vit_prob,probs,sizeof(double)*hmm->nstates);
+ double sum = 0;
+ for (i=0; i<hmm->nstates; i++) sum += hmm->init.vit_prob[i];
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] /= sum;
+ }
+ else
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates;
+
+ memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+}
hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
{
hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t));
hmm->nstates = nstates;
hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates);
hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates);
-
hmm_set_tprob(hmm, tprob, ntprob);
-
+ hmm_init_states(hmm, NULL);
return hmm;
}
-void hmm_init_states(hmm_t *hmm, double *probs)
+void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
{
- if ( !probs )
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( snapshot && snapshot->nstates!=hmm->nstates )
{
- free(hmm->init_probs);
- hmm->init_probs = NULL;
+ free(snapshot);
+ snapshot = NULL;
}
-
- if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates);
- memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates);
+ if ( !snapshot )
+ {
+ // Allocate the snapshot as a single memory block so that it can be
+ // free()-ed by the user. So make sure the arrays are aligned..
+ size_t str_size = sizeof(snapshot_t);
+ size_t dbl_size = sizeof(double);
+ size_t pad_size = (dbl_size - str_size % dbl_size) % dbl_size;
+ uint8_t *mem = (uint8_t*) malloc(str_size + pad_size + dbl_size*2*hmm->nstates);
+ snapshot = (snapshot_t*) mem;
+ snapshot->nstates = hmm->nstates;
+ snapshot->vit_prob = (double*) (mem + str_size + pad_size);
+ snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates;
+ }
+ snapshot->isite = isite;
+ hmm->snapshot = snapshot;
+ return snapshot;
+}
+void hmm_restore(hmm_t *hmm, void *_snapshot)
+{
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( !snapshot )
+ {
+ hmm->init.isite = 0;
+ return;
+ }
+ hmm->init.isite = 1;
+ hmm->init.pos = snapshot->pos;
+ memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
}
void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
}
-
// Init all states with equal likelihood
int i,j, nstates = hmm->nstates;
- if ( hmm->init_probs )
- for (i=0; i<nstates; i++) hmm->vprob[i] = hmm->init_probs[i];
- else
- for (i=0; i<nstates; i++) hmm->vprob[i] = 1./nstates;
+ memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run Viterbi
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
uint8_t *vpath = &hmm->vpath[i*nstates];
double *eprob = &eprobs[i*nstates];
int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
-
_set_tprob(hmm, pos_diff);
if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
prev_pos = sites[i];
}
for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
+
+ if ( hmm->snapshot && i==hmm->snapshot->isite )
+ {
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates);
+ }
}
// Find the most likely state
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
for (j=0; j<nstates; j++) fwd[j] /= norm;
}
+ if ( hmm->snapshot )
+ {
+ i = hmm->snapshot->isite;
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates);
+ }
+
// Run bwd
double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
prev_pos = sites[n-1];
}
}
-void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
{
// Init arrays when run for the first time
if ( hmm->nfwd < n )
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// New transition matrix: temporary values
double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
double *fwd_bwd = (double*) malloc(sizeof(double)*nstates);
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
free(tmp_gamma);
free(tmp_xi);
free(fwd_bwd);
+ return hmm->curr_tprob;
}
void hmm_destroy(hmm_t *hmm)
{
- free(hmm->init_probs);
+ free(hmm->init.vit_prob);
+ free(hmm->init.fwd_prob);
+ free(hmm->init.bwd_prob);
free(hmm->vprob);
free(hmm->vprob_tmp);
free(hmm->vpath);
#include <htslib/hts.h>
#include "HMM.h"
+typedef struct
+{
+ int nstates; // number of hmm's states
+ int isite; // take snapshot at i-th position
+ uint32_t pos; // i-th site's position
+ double *vit_prob; // viterbi probabilities, NULL for uniform probs
+ double *fwd_prob; // transition probabilities
+ double *bwd_prob; // transition probabilities
+}
+snapshot_t;
+
struct _hmm_t
{
int nstates; // number of states
set_tprob_f set_tprob; // Optional user function to set / modify transition probabilities
// at each site (one step of Viterbi algorithm)
void *set_tprob_data;
- double *init_probs; // Initial state probabilities, NULL for uniform probs
+ snapshot_t init; // Initial state probabilities. Set isite=1 when site should be used
+ snapshot_t *snapshot;
};
uint8_t *hmm_get_viterbi_path(hmm_t *hmm) { return hmm->vpath; }
memcpy(dst,out,sizeof(double)*n*n);
}
+void hmm_init_states(hmm_t *hmm, double *probs)
+{
+ hmm->init.isite = 0;
+ hmm->init.pos = 0;
+ if ( !hmm->init.vit_prob )
+ hmm->init.vit_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.fwd_prob )
+ hmm->init.fwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+ if ( !hmm->init.bwd_prob )
+ hmm->init.bwd_prob = (double*) malloc(sizeof(double)*hmm->nstates);
+
+ int i;
+ if ( probs )
+ {
+ memcpy(hmm->init.vit_prob,probs,sizeof(double)*hmm->nstates);
+ double sum = 0;
+ for (i=0; i<hmm->nstates; i++) sum += hmm->init.vit_prob[i];
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] /= sum;
+ }
+ else
+ for (i=0; i<hmm->nstates; i++) hmm->init.vit_prob[i] = 1./hmm->nstates;
+
+ memcpy(hmm->init.fwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.bwd_prob,hmm->init.vit_prob,sizeof(double)*hmm->nstates);
+}
hmm_t *hmm_init(int nstates, double *tprob, int ntprob)
{
hmm_t *hmm = (hmm_t*) calloc(1,sizeof(hmm_t));
hmm->nstates = nstates;
hmm->curr_tprob = (double*) malloc(sizeof(double)*nstates*nstates);
hmm->tmp = (double*) malloc(sizeof(double)*nstates*nstates);
-
hmm_set_tprob(hmm, tprob, ntprob);
-
+ hmm_init_states(hmm, NULL);
return hmm;
}
-void hmm_init_states(hmm_t *hmm, double *probs)
+void *hmm_snapshot(hmm_t *hmm, void *_snapshot, int isite)
{
- if ( !probs )
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( snapshot && snapshot->nstates!=hmm->nstates )
{
- free(hmm->init_probs);
- hmm->init_probs = NULL;
+ free(snapshot);
+ snapshot = NULL;
}
-
- if ( !hmm->init_probs ) hmm->init_probs = (double*) malloc(sizeof(double)*hmm->nstates);
- memcpy(hmm->init_probs,probs,sizeof(double)*hmm->nstates);
+ if ( !snapshot )
+ {
+ // Allocate the snapshot as a single memory block so that it can be
+ // free()-ed by the user. So make sure the arrays are aligned..
+ size_t str_size = sizeof(snapshot_t);
+ size_t dbl_size = sizeof(double);
+ size_t pad_size = (dbl_size - str_size % dbl_size) % dbl_size;
+ uint8_t *mem = (uint8_t*) malloc(str_size + pad_size + dbl_size*2*hmm->nstates);
+ snapshot = (snapshot_t*) mem;
+ snapshot->nstates = hmm->nstates;
+ snapshot->vit_prob = (double*) (mem + str_size + pad_size);
+ snapshot->fwd_prob = snapshot->vit_prob + hmm->nstates;
+ }
+ snapshot->isite = isite;
+ hmm->snapshot = snapshot;
+ return snapshot;
+}
+void hmm_restore(hmm_t *hmm, void *_snapshot)
+{
+ snapshot_t *snapshot = (snapshot_t*) _snapshot;
+ if ( !snapshot )
+ {
+ hmm->init.isite = 0;
+ return;
+ }
+ hmm->init.isite = 1;
+ hmm->init.pos = snapshot->pos;
+ memcpy(hmm->init.vit_prob,snapshot->vit_prob,sizeof(double)*hmm->nstates);
+ memcpy(hmm->init.fwd_prob,snapshot->fwd_prob,sizeof(double)*hmm->nstates);
}
void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob)
hmm->vprob_tmp = (double*) malloc(sizeof(double)*hmm->nstates);
}
-
// Init all states with equal likelihood
int i,j, nstates = hmm->nstates;
- if ( hmm->init_probs )
- for (i=0; i<nstates; i++) hmm->vprob[i] = hmm->init_probs[i];
- else
- for (i=0; i<nstates; i++) hmm->vprob[i] = 1./nstates;
+ memcpy(hmm->vprob, hmm->init.vit_prob, sizeof(*hmm->init.vit_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run Viterbi
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
uint8_t *vpath = &hmm->vpath[i*nstates];
double *eprob = &eprobs[i*nstates];
int pos_diff = sites[i] == prev_pos ? 0 : sites[i] - prev_pos - 1;
-
_set_tprob(hmm, pos_diff);
if ( hmm->set_tprob ) hmm->set_tprob(hmm, prev_pos, sites[i], hmm->set_tprob_data, hmm->curr_tprob);
prev_pos = sites[i];
}
for (j=0; j<nstates; j++) hmm->vprob_tmp[j] /= vnorm;
double *tmp = hmm->vprob; hmm->vprob = hmm->vprob_tmp; hmm->vprob_tmp = tmp;
+
+ if ( hmm->snapshot && i==hmm->snapshot->isite )
+ {
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->vit_prob, hmm->vprob, sizeof(*hmm->vprob)*nstates);
+ }
}
// Find the most likely state
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
for (j=0; j<nstates; j++) fwd[j] /= norm;
}
+ if ( hmm->snapshot )
+ {
+ i = hmm->snapshot->isite;
+ hmm->snapshot->pos = sites[i];
+ memcpy(hmm->snapshot->fwd_prob, hmm->fwd + (i+1)*nstates, sizeof(*hmm->fwd)*nstates);
+ }
+
// Run bwd
double *bwd = hmm->bwd, *bwd_tmp = hmm->bwd_tmp;
prev_pos = sites[n-1];
}
}
-void hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
+double *hmm_run_baum_welch(hmm_t *hmm, int n, double *eprobs, uint32_t *sites)
{
// Init arrays when run for the first time
if ( hmm->nfwd < n )
// Init all states with equal likelihood
int i,j,k, nstates = hmm->nstates;
- if ( hmm->init_probs )
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = hmm->init_probs[i];
- for (i=0; i<nstates; i++) hmm->bwd[i] = hmm->init_probs[i];
- }
- else
- {
- for (i=0; i<nstates; i++) hmm->fwd[i] = 1./hmm->nstates;
- for (i=0; i<nstates; i++) hmm->bwd[i] = 1./hmm->nstates;
- }
+ memcpy(hmm->fwd, hmm->init.fwd_prob, sizeof(*hmm->init.fwd_prob)*nstates);
+ memcpy(hmm->bwd, hmm->init.bwd_prob, sizeof(*hmm->init.bwd_prob)*nstates);
+ uint32_t prev_pos = hmm->init.isite ? hmm->init.pos : sites[0];
// New transition matrix: temporary values
double *tmp_xi = (double*) calloc(nstates*nstates,sizeof(double));
double *fwd_bwd = (double*) malloc(sizeof(double)*nstates);
// Run fwd
- uint32_t prev_pos = sites[0];
for (i=0; i<n; i++)
{
double *fwd_prev = &hmm->fwd[i*nstates];
free(tmp_gamma);
free(tmp_xi);
free(fwd_bwd);
+ return hmm->curr_tprob;
}
void hmm_destroy(hmm_t *hmm)
{
- free(hmm->init_probs);
+ free(hmm->init.vit_prob);
+ free(hmm->init.fwd_prob);
+ free(hmm->init.bwd_prob);
free(hmm->vprob);
free(hmm->vprob_tmp);
free(hmm->vpath);
hmm_t *hmm_init(int nstates, double *tprob, int ntprob);
void hmm_set_tprob(hmm_t *hmm, double *tprob, int ntprob);
+#define HMM_VIT 1
+#define HMM_FWD 2
+#define HMM_BWD 4
+
/**
* hmm_init_states() - initial state probabilities
* @probs: initial state probabilities or NULL to reset to default
*/
void hmm_init_states(hmm_t *hmm, double *probs);
+/**
+ * hmm_snapshot() - take the model's snapshot, intended for sliding HMM
+ * @snapshot: NULL or snapshot returned by previous hmm_snapshot() call, must be free()-ed by the caller
+ * @isite: take the snapshot at i-th step
+ */
+void *hmm_snapshot(hmm_t *hmm, void *snapshot, int isite);
+
+/**
+ * hmm_restore() - restore model's snapshot, intended for sliding HMM
+ * @snapshot: snapshot returned by hmm_snapshot() call or NULL to reset
+ * @isite: take the snapshot at i-th step
+ */
+void hmm_restore(hmm_t *hmm, void *snapshot);
+
/**
* hmm_get_tprob() - return the array of transition matrices, precalculated
* to ntprob positions. The first matrix is the initial tprob matrix
* @eprob: emission probabilities for each site and state (nsites x nstates)
* @sites: list of positions
*
- * Same as hmm_run_fwd_bwd, in addition curr_tprob contains the new
- * transition probabilities. In this verison, emission probabilities
- * are not updated.
+ * Same as hmm_run_fwd_bwd, in addition a pointer to a matrix with the new
+ * transition probabilities is returned. In this verison, emission
+ * probabilities are not updated.
*/
-void hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
+double *hmm_run_baum_welch(hmm_t *hmm, int nsites, double *eprob, uint32_t *sites);
void hmm_destroy(hmm_t *hmm);
--- /dev/null
+/* bam2bcf.c -- variant calling.
+
+ Copyright (C) 2010-2012 Broad Institute.
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <math.h>
+#include <stdint.h>
+#include <assert.h>
+#include <float.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/kstring.h>
+#include <htslib/kfunc.h>
+#include "bam2bcf.h"
+
+extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
+
+#define CALL_DEFTHETA 0.83
+#define DEF_MAPQ 20
+
+#define CAP_DIST 25
+
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+{
+ bcf_callaux_t *bca;
+ if (theta <= 0.) theta = CALL_DEFTHETA;
+ bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
+ bca->capQ = 60;
+ bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
+ bca->min_baseQ = min_baseQ;
+ bca->e = errmod_init(1. - theta);
+ bca->min_frac = 0.002;
+ bca->min_support = 1;
+ bca->per_sample_flt = 0;
+ bca->npos = 100;
+ bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->nqual = 60;
+ bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
+ bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
+ return bca;
+}
+
+void bcf_call_destroy(bcf_callaux_t *bca)
+{
+ if (bca == 0) return;
+ errmod_destroy(bca->e);
+ if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
+ free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
+ free(bca->fwd_mqs); free(bca->rev_mqs);
+ bca->nqual = 0;
+ free(bca->bases); free(bca->inscns); free(bca);
+}
+
+// position in the sequence with respect to the aligned part of the read
+static int get_position(const bam_pileup1_t *p, int *len)
+{
+ int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
+ for (icig=0; icig<p->b->core.n_cigar; icig++)
+ {
+ int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
+ int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
+ if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
+ {
+ n_tot_bases += ncig;
+ iread += ncig;
+ continue;
+ }
+ if ( cig==BAM_CINS )
+ {
+ n_tot_bases += ncig;
+ iread += ncig;
+ continue;
+ }
+ if ( cig==BAM_CSOFT_CLIP )
+ {
+ iread += ncig;
+ if ( iread<=p->qpos ) edist -= ncig;
+ continue;
+ }
+ if ( cig==BAM_CDEL ) continue;
+ if ( cig==BAM_CHARD_CLIP ) continue;
+ if ( cig==BAM_CPAD ) continue;
+ if ( cig==BAM_CREF_SKIP ) continue;
+ fprintf(stderr,"todo: cigar %d\n", cig);
+ assert(0);
+ }
+ *len = n_tot_bases;
+ return edist;
+}
+
+void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
+{
+ memset(bca->ref_pos,0,sizeof(int)*bca->npos);
+ memset(bca->alt_pos,0,sizeof(int)*bca->npos);
+ memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
+ memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
+ memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
+ memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
+ if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+ if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+}
+
+/*
+ Notes:
+ - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
+ which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
+ Later it's used for multiallelic calling by bcftools -m
+ - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
+ */
+/*
+ * This function is called once for each sample.
+ * _n is number of pilesups pl contributing reads to this sample
+ * pl is pointer to array of _n pileups (one pileup per read)
+ * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
+ * bca is the settings to perform calls across all samples
+ * r is the returned value of the call
+ */
+int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
+{
+ int i, n, ref4, is_indel, ori_depth = 0;
+
+ // clean from previous run
+ r->ori_depth = 0;
+ r->mq0 = 0;
+ memset(r->qsum,0,sizeof(float)*4);
+ memset(r->anno,0,sizeof(double)*16);
+ memset(r->p,0,sizeof(float)*25);
+
+ if (ref_base >= 0) {
+ ref4 = seq_nt16_int[ref_base];
+ is_indel = 0;
+ } else ref4 = 4, is_indel = 1;
+ if (_n == 0) return -1;
+ // enlarge the bases array if necessary
+ if (bca->max_bases < _n) {
+ bca->max_bases = _n;
+ kroundup32(bca->max_bases);
+ bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
+ }
+ // fill the bases array
+ for (i = n = 0; i < _n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+ // set base
+ if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+ ++ori_depth;
+ mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
+ if ( !mapQ ) r->mq0++;
+ baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
+ seqQ = is_indel? (p->aux>>8&0xff) : 99;
+ if (q < bca->min_baseQ) continue;
+ if (q > seqQ) q = seqQ;
+ mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
+ if (q > mapQ) q = mapQ;
+ if (q > 63) q = 63;
+ if (q < 4) q = 4; // MQ=0 reads count as BQ=4
+ if (!is_indel) {
+ b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
+ b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
+ is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+ } else {
+ b = p->aux>>16&0x3f;
+ is_diff = (b != 0);
+ }
+ bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
+ // collect annotations
+ if (b < 4)
+ {
+ r->qsum[b] += q;
+ if ( r->ADF )
+ {
+ if ( bam_is_rev(p->b) )
+ r->ADR[b]++;
+ else
+ r->ADF[b]++;
+ }
+ }
+ ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
+ min_dist = p->b->core.l_qseq - 1 - p->qpos;
+ if (min_dist > p->qpos) min_dist = p->qpos;
+ if (min_dist > CAP_DIST) min_dist = CAP_DIST;
+ r->anno[1<<2|is_diff<<1|0] += baseQ;
+ r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
+ r->anno[2<<2|is_diff<<1|0] += mapQ;
+ r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
+ r->anno[3<<2|is_diff<<1|0] += min_dist;
+ r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
+
+ // collect for bias tests
+ if ( baseQ > 59 ) baseQ = 59;
+ if ( mapQ > 59 ) mapQ = 59;
+ int len, pos = get_position(p, &len);
+ int epos = (double)pos/(len+1) * bca->npos;
+ int ibq = baseQ/60. * bca->nqual;
+ int imq = mapQ/60. * bca->nqual;
+ if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
+ else bca->fwd_mqs[imq]++;
+ if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
+ {
+ bca->ref_pos[epos]++;
+ bca->ref_bq[ibq]++;
+ bca->ref_mq[imq]++;
+ }
+ else
+ {
+ bca->alt_pos[epos]++;
+ bca->alt_bq[ibq]++;
+ bca->alt_mq[imq]++;
+ }
+ }
+ r->ori_depth = ori_depth;
+ // glfgen
+ errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
+ return n;
+}
+
+
+/*
+ * calc_vdb() - returns value between zero (most biased) and one (no bias)
+ * on success, or HUGE_VAL when VDB cannot be calculated because
+ * of insufficient depth (<2x)
+ *
+ * Variant Distance Bias tests if the variant bases are positioned within the
+ * reads with sufficient randomness. Unlike other tests, it looks only at
+ * variant reads and therefore gives different kind of information than Read
+ * Position Bias for instance. VDB was developed for detecting artefacts in
+ * RNA-seq calls where reads from spliced transcripts span splice site
+ * boundaries. The current implementation differs somewhat from the original
+ * version described in supplementary material of PMID:22524474, but the idea
+ * remains the same. (Here the random variable tested is the average distance
+ * from the averaged position, not the average pairwise distance.)
+ *
+ * For coverage of 2x, the calculation is exact but is approximated for the
+ * rest. The result is most accurate between 4-200x. For 3x or >200x, the
+ * reported values are slightly more favourable than those of a true random
+ * distribution.
+ */
+double calc_vdb(int *pos, int npos)
+{
+ // Note well: the parameters were obtained by fitting to simulated data of
+ // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
+ const int readlen = 100;
+ assert( npos==readlen );
+
+ #define nparam 15
+ const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
+ {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
+ {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
+ {200,0.7,23.7} };
+
+ int i, dp = 0;
+ float mean_pos = 0, mean_diff = 0;
+ for (i=0; i<npos; i++)
+ {
+ if ( !pos[i] ) continue;
+ dp += pos[i];
+ mean_pos += pos[i]*i;
+ }
+ if ( dp<2 ) return HUGE_VAL; // one or zero reads can be placed anywhere
+
+ mean_pos /= dp;
+ for (i=0; i<npos; i++)
+ {
+ if ( !pos[i] ) continue;
+ mean_diff += pos[i] * fabs(i - mean_pos);
+ }
+ mean_diff /= dp;
+
+ int ipos = mean_diff; // tuned for float-to-int implicit conversion
+ if ( dp==2 )
+ return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
+
+ if ( dp>=200 )
+ i = nparam; // shortcut for big depths
+ else
+ {
+ for (i=0; i<nparam; i++)
+ if ( param[i][0]>=dp ) break;
+ }
+ float pshift, pscale;
+ if ( i==nparam )
+ {
+ // the depth is too high, go with 200x
+ pscale = param[nparam-1][1];
+ pshift = param[nparam-1][2];
+ }
+ else if ( i>0 && param[i][0]!=dp )
+ {
+ // linear interpolation of parameters
+ pscale = (param[i-1][1] + param[i][1])*0.5;
+ pshift = (param[i-1][2] + param[i][2])*0.5;
+ }
+ else
+ {
+ pscale = param[i][1];
+ pshift = param[i][2];
+ }
+ return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
+}
+
+double calc_chisq_bias(int *a, int *b, int n)
+{
+ int na = 0, nb = 0, i, ndf = n;
+ for (i=0; i<n; i++) na += a[i];
+ for (i=0; i<n; i++) nb += b[i];
+ if ( !na || !nb ) return HUGE_VAL;
+
+ double chisq = 0;
+ for (i=0; i<n; i++)
+ {
+ if ( !a[i] && !b[i] ) ndf--;
+ else
+ {
+ double tmp = a[i] - b[i];
+ chisq += tmp*tmp/(a[i]+b[i]);
+ }
+ }
+ /*
+ kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
+ 1 if the distributions are identical, 0 if very different
+ */
+ double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
+ return prob;
+}
+
+static double mann_whitney_1947_(int n, int m, int U)
+{
+ if (U<0) return 0;
+ if (n==0||m==0) return U==0 ? 1 : 0;
+ return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
+}
+
+double mann_whitney_1947(int n, int m, int U)
+{
+ #include "mw.h"
+
+ assert(n >= 2 && m >= 2);
+
+ return (n < 8 && m < 8 && U < 50)
+ ? mw[n-2][m-2][U]
+ : mann_whitney_1947_(n,m,U);
+}
+
+double mann_whitney_1947_cdf(int n, int m, int U)
+{
+ int i;
+ double sum = 0;
+ for (i=0; i<=U; i++)
+ sum += mann_whitney_1947(n,m,i);
+ return sum;
+}
+
+double calc_mwu_bias_cdf(int *a, int *b, int n)
+{
+ int na = 0, nb = 0, i;
+ double U = 0, ties = 0;
+ for (i=0; i<n; i++)
+ {
+ na += a[i];
+ U += a[i] * (nb + b[i]*0.5);
+ nb += b[i];
+ if ( a[i] && b[i] )
+ {
+ double tie = a[i] + b[i];
+ ties += (tie*tie-1)*tie;
+ }
+ }
+ if ( !na || !nb ) return HUGE_VAL;
+
+ // Always work with the smaller U
+ double U_min = ((double)na * nb) - U;
+ if ( U < U_min ) U_min = U;
+
+ if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
+ if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
+
+ // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
+ if ( na>=8 || nb>=8 )
+ {
+ double mean = ((double)na*nb)*0.5;
+ // Correction for ties:
+ // double N = na+nb;
+ // double var2 = (N*N-1)*N-ties;
+ // if ( var2==0 ) return 1.0;
+ // var2 *= ((double)na*nb)/N/(N-1)/12.0;
+ // No correction for ties:
+ double var2 = ((double)na*nb)*(na+nb+1)/12.0;
+ double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
+ return 2.0 - kf_erfc(z); // which is 1 + erf(z)
+ }
+
+ // Exact calculation
+ double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
+ return pval>1 ? 1 : pval;
+}
+
+double calc_mwu_bias(int *a, int *b, int n)
+{
+ int na = 0, nb = 0, i;
+ double U = 0, ties = 0;
+ for (i=0; i<n; i++)
+ {
+ if (!a[i]) {
+ if (!b[i]) continue;
+ nb += b[i];
+ } else if (!b[i]) {
+ na += a[i];
+ U += a[i] * nb;
+ } else {
+ na += a[i];
+ U += a[i] * (nb + b[i]*0.5);
+ nb += b[i];
+ double tie = a[i] + b[i];
+ ties += (tie*tie-1)*tie;
+ }
+ }
+ if ( !na || !nb ) return HUGE_VAL;
+ if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
+
+ double mean = ((double)na*nb)*0.5;
+ if ( na==2 || nb==2 )
+ {
+ // Linear approximation
+ return U>mean ? (2.0*mean-U)/mean : U/mean;
+ }
+ // Correction for ties:
+ // double N = na+nb;
+ // double var2 = (N*N-1)*N-ties;
+ // if ( var2==0 ) return 1.0;
+ // var2 *= ((double)na*nb)/N/(N-1)/12.0;
+ // No correction for ties:
+ double var2 = ((double)na*nb)*(na+nb+1)/12.0;
+ if ( na>=8 || nb>=8 )
+ {
+ // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
+ return exp(-0.5*(U-mean)*(U-mean)/var2);
+ }
+
+ // Exact calculation
+ return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
+}
+
+static inline double logsumexp2(double a, double b)
+{
+ if ( a>b )
+ return log(1 + exp(b-a)) + a;
+ else
+ return log(1 + exp(a-b)) + b;
+}
+
+void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
+{
+ call->seg_bias = HUGE_VAL;
+ if ( !bcr ) return;
+
+ int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
+ if ( !nr ) return;
+
+ int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth
+ double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population
+ if ( M>call->n ) M = call->n; // clamp M at the number of samples
+ else if ( M==0 ) M = 1;
+ double f = M / 2. / call->n; // allele frequency
+ double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson)
+ double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson)
+ double sum = 0;
+ const double log2 = log(2.0);
+
+ // fprintf(stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
+ int i;
+ for (i=0; i<call->n; i++)
+ {
+ int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads
+ double tmp;
+ if ( oi )
+ {
+ // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
+ tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
+ tmp += log(f) + oi*log(q/p) - q + p;
+ }
+ else
+ tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
+ sum += tmp;
+ // fprintf(stderr,"oi=%d %e\n", oi,tmp);
+ }
+ call->seg_bias = sum;
+}
+
+/**
+ * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
+ * @n: number of samples
+ * @calls: each sample's calls
+ * @bca: auxiliary data structure for holding temporary values
+ * @ref_base: the reference base
+ * @call: filled with the annotations
+ *
+ * Combines calls across the various samples being studied
+ * 1. For each allele at each base across all samples the quality is summed so
+ * you end up with a set of quality sums for each allele present 2. The quality
+ * sums are sorted.
+ * 3. Using the sorted quality sums we now create the allele ordering array
+ * A\subN. This is done by doing the following:
+ * a) If the reference allele is known it always comes first, otherwise N
+ * comes first.
+ * b) Then the rest of the alleles are output in descending order of quality
+ * sum (which we already know the qsum array was sorted). Any allelles with
+ * qsum 0 will be excluded.
+ * 4. Using the allele ordering array we create the genotype ordering array.
+ * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1
+ * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
+ * 5. The genotype ordering array is then used to extract data from the error
+ * model 5*5 matrix and is used to produce a Phread likelihood array for each
+ * sample.
+ */
+int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
+{
+ int ref4, i, j;
+ float qsum[5] = {0,0,0,0,0};
+ if (ref_base >= 0) {
+ call->ori_ref = ref4 = seq_nt16_int[ref_base];
+ if (ref4 > 4) ref4 = 4;
+ } else call->ori_ref = -1, ref4 = 0;
+
+ // calculate qsum, this is done by summing normalized qsum across all samples,
+ // to account for differences in coverage
+ for (i = 0; i < n; ++i)
+ {
+ float sum = 0;
+ for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
+ if ( sum )
+ for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
+ }
+
+ // sort qsum in ascending order (insertion sort)
+ float *ptr[5], *tmp;
+ for (i=0; i<5; i++) ptr[i] = &qsum[i];
+ for (i=1; i<4; i++)
+ for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
+ tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
+
+ // Set the reference allele and alternative allele(s)
+ for (i=0; i<5; i++) call->a[i] = -1;
+ for (i=0; i<5; i++) call->qsum[i] = 0;
+ call->unseen = -1;
+ call->a[0] = ref4;
+ for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
+ {
+ int ipos = ptr[i] - qsum; // position in sorted qsum array
+ if ( ipos==ref4 )
+ call->qsum[0] = qsum[ipos]; // REF's qsum
+ else
+ {
+ if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup
+ call->qsum[j] = qsum[ipos];
+ call->a[j++] = ipos;
+ }
+ }
+ if (ref_base >= 0)
+ {
+ // for SNPs, find the "unseen" base
+ if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
+ call->unseen = j, call->a[j++] = ptr[i] - qsum;
+ call->n_alleles = j;
+ }
+ else
+ {
+ call->n_alleles = j;
+ if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
+ }
+ /*
+ * Set the phread likelihood array (call->PL) This array is 15 entries long
+ * for each sample because that is size of an upper or lower triangle of a
+ * worst case 5x5 matrix of possible genotypes. This worst case matrix will
+ * occur when all 4 possible alleles are present and the reference allele
+ * is unknown. The sides of the matrix will correspond to the reference
+ * allele (if known) followed by the alleles present in descending order of
+ * quality sum
+ */
+ {
+ int x, g[15], z;
+ double sum_min = 0.;
+ x = call->n_alleles * (call->n_alleles + 1) / 2;
+ // get the possible genotypes
+ // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
+ for (i = z = 0; i < call->n_alleles; ++i) {
+ for (j = 0; j <= i; ++j) {
+ g[z++] = call->a[j] * 5 + call->a[i];
+ }
+ }
+ // for each sample calculate the PL
+ for (i = 0; i < n; ++i)
+ {
+ int32_t *PL = call->PL + x * i;
+ const bcf_callret1_t *r = calls + i;
+ float min = FLT_MAX;
+ for (j = 0; j < x; ++j) {
+ if (min > r->p[g[j]]) min = r->p[g[j]];
+ }
+ sum_min += min;
+ for (j = 0; j < x; ++j) {
+ int y;
+ y = (int)(r->p[g[j]] - min + .499);
+ if (y > 255) y = 255;
+ PL[j] = y;
+ }
+ }
+ if ( call->DP4 )
+ {
+ for (i=0; i<n; i++)
+ {
+ call->DP4[4*i] = calls[i].anno[0];
+ call->DP4[4*i+1] = calls[i].anno[1];
+ call->DP4[4*i+2] = calls[i].anno[2];
+ call->DP4[4*i+3] = calls[i].anno[3];
+ }
+ }
+ if ( call->ADF )
+ {
+ assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
+
+ // reorder ADR,ADF to match the allele ordering at this site
+ int32_t tmp[B2B_MAX_ALLELES];
+ int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
+ int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
+ int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
+ int32_t *adf_tot = call->ADF;
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<call->n_alleles; j++)
+ {
+ tmp[j] = adr[ call->a[j] ];
+ adr_tot[j] += tmp[j];
+ }
+ for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
+ for (j=0; j<call->n_alleles; j++)
+ {
+ tmp[j] = adf[ call->a[j] ];
+ adf_tot[j] += tmp[j];
+ }
+ for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
+ adf_out += call->n_alleles;
+ adr_out += call->n_alleles;
+ adr += B2B_MAX_ALLELES;
+ adf += B2B_MAX_ALLELES;
+ }
+ }
+
+// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
+ call->shift = (int)(sum_min + .499);
+ }
+ // combine annotations
+ memset(call->anno, 0, 16 * sizeof(double));
+ call->ori_depth = 0;
+ call->depth = 0;
+ call->mq0 = 0;
+ for (i = 0; i < n; ++i) {
+ call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
+ call->ori_depth += calls[i].ori_depth;
+ call->mq0 += calls[i].mq0;
+ for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
+ }
+
+ calc_SegBias(calls, call);
+
+ // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
+ // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
+ // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
+
+ call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
+ call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual);
+ call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual);
+ call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+
+#if CDF_MWU_TESTS
+ call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
+ call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
+ call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
+ call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+#endif
+
+ call->vdb = calc_vdb(bca->alt_pos, bca->npos);
+
+ return 0;
+}
+
+int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
+{
+ extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+ int i, j, nals = 1;
+
+ bcf_hdr_t *hdr = bc->bcf_hdr;
+ rec->rid = bc->tid;
+ rec->pos = bc->pos;
+ rec->qual = 0;
+
+ bc->tmp.l = 0;
+ if (bc->ori_ref < 0) // indel
+ {
+ // REF
+ kputc(ref[bc->pos], &bc->tmp);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
+
+ // ALT
+ for (i=1; i<4; i++)
+ {
+ if (bc->a[i] < 0) break;
+ kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
+
+ if (bca->indel_types[bc->a[i]] < 0) { // deletion
+ for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
+ kputc(ref[bc->pos+1+j], &bc->tmp);
+ } else { // insertion; cannot be a reference unless a bug
+ char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
+ for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
+ kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
+ }
+ nals++;
+ }
+ }
+ else // SNP
+ {
+ kputc("ACGTN"[bc->ori_ref], &bc->tmp);
+ for (i=1; i<5; i++)
+ {
+ if (bc->a[i] < 0) break;
+ kputc(',', &bc->tmp);
+ if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
+ else kputc("ACGT"[bc->a[i]], &bc->tmp);
+ nals++;
+ }
+ }
+ bcf_update_alleles_str(hdr, rec, bc->tmp.s);
+
+ bc->tmp.l = 0;
+
+ // INFO
+ if (bc->ori_ref < 0)
+ {
+ bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
+ bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
+ bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
+ }
+ bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
+ if ( fmt_flag&B2B_INFO_ADF )
+ bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_ADR )
+ bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
+ if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
+ {
+ for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
+ if ( fmt_flag&B2B_INFO_AD )
+ bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_DPR )
+ bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
+ }
+
+ float tmpf[16];
+ for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
+ bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
+ bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
+
+ if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
+ if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
+ if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+#if CDF_MWU_TESTS
+ if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
+ if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
+ if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
+ if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
+#endif
+ tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
+ bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
+
+ // FORMAT
+ rec->n_sample = bc->n;
+ bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
+ if ( fmt_flag&B2B_FMT_DP )
+ {
+ int32_t *ptr = (int32_t*) bc->fmt_arr;
+ for (i=0; i<bc->n; i++)
+ ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
+ bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
+ }
+ if ( fmt_flag&B2B_FMT_DV )
+ {
+ int32_t *ptr = (int32_t*) bc->fmt_arr;
+ for (i=0; i<bc->n; i++)
+ ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
+ bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
+ }
+ if ( fmt_flag&B2B_FMT_SP )
+ {
+ int32_t *ptr = (int32_t*) bc->fmt_arr;
+ for (i=0; i<bc->n; i++)
+ {
+ int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
+ if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
+ ptr[i] = 0;
+ else
+ {
+ double left, right, two;
+ kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
+ int32_t x = (int)(-4.343 * log(two) + .499);
+ if (x > 255) x = 255;
+ ptr[i] = x;
+ }
+ }
+ bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
+ }
+ if ( fmt_flag&B2B_FMT_DP4 )
+ bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
+ if ( fmt_flag&B2B_FMT_ADF )
+ bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_ADR )
+ bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
+ {
+ for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
+ if ( fmt_flag&B2B_FMT_AD )
+ bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_DPR )
+ bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ }
+
+ return 0;
+}
--- /dev/null
+#include "pysam.h"
+
+/* bam2bcf.c -- variant calling.
+
+ Copyright (C) 2010-2012 Broad Institute.
+ Copyright (C) 2012-2014 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <math.h>
+#include <stdint.h>
+#include <assert.h>
+#include <float.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/kstring.h>
+#include <htslib/kfunc.h>
+#include "bam2bcf.h"
+
+extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
+
+#define CALL_DEFTHETA 0.83
+#define DEF_MAPQ 20
+
+#define CAP_DIST 25
+
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+{
+ bcf_callaux_t *bca;
+ if (theta <= 0.) theta = CALL_DEFTHETA;
+ bca = (bcf_callaux_t*) calloc(1, sizeof(bcf_callaux_t));
+ bca->capQ = 60;
+ bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
+ bca->min_baseQ = min_baseQ;
+ bca->e = errmod_init(1. - theta);
+ bca->min_frac = 0.002;
+ bca->min_support = 1;
+ bca->per_sample_flt = 0;
+ bca->npos = 100;
+ bca->ref_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->alt_pos = (int*) malloc(bca->npos*sizeof(int));
+ bca->nqual = 60;
+ bca->ref_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_mq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->ref_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->alt_bq = (int*) malloc(bca->nqual*sizeof(int));
+ bca->fwd_mqs = (int*) malloc(bca->nqual*sizeof(int));
+ bca->rev_mqs = (int*) malloc(bca->nqual*sizeof(int));
+ return bca;
+}
+
+void bcf_call_destroy(bcf_callaux_t *bca)
+{
+ if (bca == 0) return;
+ errmod_destroy(bca->e);
+ if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; }
+ free(bca->ref_mq); free(bca->alt_mq); free(bca->ref_bq); free(bca->alt_bq);
+ free(bca->fwd_mqs); free(bca->rev_mqs);
+ bca->nqual = 0;
+ free(bca->bases); free(bca->inscns); free(bca);
+}
+
+// position in the sequence with respect to the aligned part of the read
+static int get_position(const bam_pileup1_t *p, int *len)
+{
+ int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1;
+ for (icig=0; icig<p->b->core.n_cigar; icig++)
+ {
+ int cig = bam_get_cigar(p->b)[icig] & BAM_CIGAR_MASK;
+ int ncig = bam_get_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT;
+ if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
+ {
+ n_tot_bases += ncig;
+ iread += ncig;
+ continue;
+ }
+ if ( cig==BAM_CINS )
+ {
+ n_tot_bases += ncig;
+ iread += ncig;
+ continue;
+ }
+ if ( cig==BAM_CSOFT_CLIP )
+ {
+ iread += ncig;
+ if ( iread<=p->qpos ) edist -= ncig;
+ continue;
+ }
+ if ( cig==BAM_CDEL ) continue;
+ if ( cig==BAM_CHARD_CLIP ) continue;
+ if ( cig==BAM_CPAD ) continue;
+ if ( cig==BAM_CREF_SKIP ) continue;
+ fprintf(pysam_stderr,"todo: cigar %d\n", cig);
+ assert(0);
+ }
+ *len = n_tot_bases;
+ return edist;
+}
+
+void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
+{
+ memset(bca->ref_pos,0,sizeof(int)*bca->npos);
+ memset(bca->alt_pos,0,sizeof(int)*bca->npos);
+ memset(bca->ref_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->alt_mq,0,sizeof(int)*bca->nqual);
+ memset(bca->ref_bq,0,sizeof(int)*bca->nqual);
+ memset(bca->alt_bq,0,sizeof(int)*bca->nqual);
+ memset(bca->fwd_mqs,0,sizeof(int)*bca->nqual);
+ memset(bca->rev_mqs,0,sizeof(int)*bca->nqual);
+ if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+ if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
+}
+
+/*
+ Notes:
+ - Called from bam_plcmd.c by mpileup. Amongst other things, sets the bcf_callret1_t.qsum frequencies
+ which are carried over via bcf_call_combine and bcf_call2bcf to the output BCF as the QS annotation.
+ Later it's used for multiallelic calling by bcftools -m
+ - ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
+ */
+/*
+ * This function is called once for each sample.
+ * _n is number of pilesups pl contributing reads to this sample
+ * pl is pointer to array of _n pileups (one pileup per read)
+ * ref_base is the 4-bit representation of the reference base. It is negative if we are looking at an indel.
+ * bca is the settings to perform calls across all samples
+ * r is the returned value of the call
+ */
+int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
+{
+ int i, n, ref4, is_indel, ori_depth = 0;
+
+ // clean from previous run
+ r->ori_depth = 0;
+ r->mq0 = 0;
+ memset(r->qsum,0,sizeof(float)*4);
+ memset(r->anno,0,sizeof(double)*16);
+ memset(r->p,0,sizeof(float)*25);
+
+ if (ref_base >= 0) {
+ ref4 = seq_nt16_int[ref_base];
+ is_indel = 0;
+ } else ref4 = 4, is_indel = 1;
+ if (_n == 0) return -1;
+ // enlarge the bases array if necessary
+ if (bca->max_bases < _n) {
+ bca->max_bases = _n;
+ kroundup32(bca->max_bases);
+ bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
+ }
+ // fill the bases array
+ for (i = n = 0; i < _n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+ // set base
+ if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+ ++ori_depth;
+ mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
+ if ( !mapQ ) r->mq0++;
+ baseQ = q = is_indel? p->aux&0xff : (int)bam_get_qual(p->b)[p->qpos]; // base/indel quality
+ seqQ = is_indel? (p->aux>>8&0xff) : 99;
+ if (q < bca->min_baseQ) continue;
+ if (q > seqQ) q = seqQ;
+ mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
+ if (q > mapQ) q = mapQ;
+ if (q > 63) q = 63;
+ if (q < 4) q = 4; // MQ=0 reads count as BQ=4
+ if (!is_indel) {
+ b = bam_seqi(bam_get_seq(p->b), p->qpos); // base
+ b = seq_nt16_int[b? b : ref_base]; // b is the 2-bit base
+ is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+ } else {
+ b = p->aux>>16&0x3f;
+ is_diff = (b != 0);
+ }
+ bca->bases[n++] = q<<5 | (int)bam_is_rev(p->b)<<4 | b;
+ // collect annotations
+ if (b < 4)
+ {
+ r->qsum[b] += q;
+ if ( r->ADF )
+ {
+ if ( bam_is_rev(p->b) )
+ r->ADR[b]++;
+ else
+ r->ADF[b]++;
+ }
+ }
+ ++r->anno[0<<2|is_diff<<1|bam_is_rev(p->b)];
+ min_dist = p->b->core.l_qseq - 1 - p->qpos;
+ if (min_dist > p->qpos) min_dist = p->qpos;
+ if (min_dist > CAP_DIST) min_dist = CAP_DIST;
+ r->anno[1<<2|is_diff<<1|0] += baseQ;
+ r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
+ r->anno[2<<2|is_diff<<1|0] += mapQ;
+ r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
+ r->anno[3<<2|is_diff<<1|0] += min_dist;
+ r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
+
+ // collect for bias tests
+ if ( baseQ > 59 ) baseQ = 59;
+ if ( mapQ > 59 ) mapQ = 59;
+ int len, pos = get_position(p, &len);
+ int epos = (double)pos/(len+1) * bca->npos;
+ int ibq = baseQ/60. * bca->nqual;
+ int imq = mapQ/60. * bca->nqual;
+ if ( bam_is_rev(p->b) ) bca->rev_mqs[imq]++;
+ else bca->fwd_mqs[imq]++;
+ if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
+ {
+ bca->ref_pos[epos]++;
+ bca->ref_bq[ibq]++;
+ bca->ref_mq[imq]++;
+ }
+ else
+ {
+ bca->alt_pos[epos]++;
+ bca->alt_bq[ibq]++;
+ bca->alt_mq[imq]++;
+ }
+ }
+ r->ori_depth = ori_depth;
+ // glfgen
+ errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype
+ return n;
+}
+
+
+/*
+ * calc_vdb() - returns value between zero (most biased) and one (no bias)
+ * on success, or HUGE_VAL when VDB cannot be calculated because
+ * of insufficient depth (<2x)
+ *
+ * Variant Distance Bias tests if the variant bases are positioned within the
+ * reads with sufficient randomness. Unlike other tests, it looks only at
+ * variant reads and therefore gives different kind of information than Read
+ * Position Bias for instance. VDB was developed for detecting artefacts in
+ * RNA-seq calls where reads from spliced transcripts span splice site
+ * boundaries. The current implementation differs somewhat from the original
+ * version described in supplementary material of PMID:22524474, but the idea
+ * remains the same. (Here the random variable tested is the average distance
+ * from the averaged position, not the average pairwise distance.)
+ *
+ * For coverage of 2x, the calculation is exact but is approximated for the
+ * rest. The result is most accurate between 4-200x. For 3x or >200x, the
+ * reported values are slightly more favourable than those of a true random
+ * distribution.
+ */
+double calc_vdb(int *pos, int npos)
+{
+ // Note well: the parameters were obtained by fitting to simulated data of
+ // 100bp reads. This assumes rescaling to 100bp in bcf_call_glfgen().
+ const int readlen = 100;
+ assert( npos==readlen );
+
+ #define nparam 15
+ const float param[nparam][3] = { {3,0.079,18}, {4,0.09,19.8}, {5,0.1,20.5}, {6,0.11,21.5},
+ {7,0.125,21.6}, {8,0.135,22}, {9,0.14,22.2}, {10,0.153,22.3}, {15,0.19,22.8},
+ {20,0.22,23.2}, {30,0.26,23.4}, {40,0.29,23.5}, {50,0.35,23.65}, {100,0.5,23.7},
+ {200,0.7,23.7} };
+
+ int i, dp = 0;
+ float mean_pos = 0, mean_diff = 0;
+ for (i=0; i<npos; i++)
+ {
+ if ( !pos[i] ) continue;
+ dp += pos[i];
+ mean_pos += pos[i]*i;
+ }
+ if ( dp<2 ) return HUGE_VAL; // one or zero reads can be placed anywhere
+
+ mean_pos /= dp;
+ for (i=0; i<npos; i++)
+ {
+ if ( !pos[i] ) continue;
+ mean_diff += pos[i] * fabs(i - mean_pos);
+ }
+ mean_diff /= dp;
+
+ int ipos = mean_diff; // tuned for float-to-int implicit conversion
+ if ( dp==2 )
+ return (2*readlen-2*(ipos+1)-1)*(ipos+1)/(readlen-1)/(readlen*0.5);
+
+ if ( dp>=200 )
+ i = nparam; // shortcut for big depths
+ else
+ {
+ for (i=0; i<nparam; i++)
+ if ( param[i][0]>=dp ) break;
+ }
+ float pshift, pscale;
+ if ( i==nparam )
+ {
+ // the depth is too high, go with 200x
+ pscale = param[nparam-1][1];
+ pshift = param[nparam-1][2];
+ }
+ else if ( i>0 && param[i][0]!=dp )
+ {
+ // linear interpolation of parameters
+ pscale = (param[i-1][1] + param[i][1])*0.5;
+ pshift = (param[i-1][2] + param[i][2])*0.5;
+ }
+ else
+ {
+ pscale = param[i][1];
+ pshift = param[i][2];
+ }
+ return 0.5*kf_erfc(-(mean_diff-pshift)*pscale);
+}
+
+double calc_chisq_bias(int *a, int *b, int n)
+{
+ int na = 0, nb = 0, i, ndf = n;
+ for (i=0; i<n; i++) na += a[i];
+ for (i=0; i<n; i++) nb += b[i];
+ if ( !na || !nb ) return HUGE_VAL;
+
+ double chisq = 0;
+ for (i=0; i<n; i++)
+ {
+ if ( !a[i] && !b[i] ) ndf--;
+ else
+ {
+ double tmp = a[i] - b[i];
+ chisq += tmp*tmp/(a[i]+b[i]);
+ }
+ }
+ /*
+ kf_gammq: incomplete gamma function Q(a,x) = 1 - P(a,x) = Gamma(a,x)/Gamma(a)
+ 1 if the distributions are identical, 0 if very different
+ */
+ double prob = kf_gammaq(0.5*ndf, 0.5*chisq);
+ return prob;
+}
+
+static double mann_whitney_1947_(int n, int m, int U)
+{
+ if (U<0) return 0;
+ if (n==0||m==0) return U==0 ? 1 : 0;
+ return (double)n/(n+m)*mann_whitney_1947_(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947_(n,m-1,U);
+}
+
+double mann_whitney_1947(int n, int m, int U)
+{
+ #include "mw.h"
+
+ assert(n >= 2 && m >= 2);
+
+ return (n < 8 && m < 8 && U < 50)
+ ? mw[n-2][m-2][U]
+ : mann_whitney_1947_(n,m,U);
+}
+
+double mann_whitney_1947_cdf(int n, int m, int U)
+{
+ int i;
+ double sum = 0;
+ for (i=0; i<=U; i++)
+ sum += mann_whitney_1947(n,m,i);
+ return sum;
+}
+
+double calc_mwu_bias_cdf(int *a, int *b, int n)
+{
+ int na = 0, nb = 0, i;
+ double U = 0, ties = 0;
+ for (i=0; i<n; i++)
+ {
+ na += a[i];
+ U += a[i] * (nb + b[i]*0.5);
+ nb += b[i];
+ if ( a[i] && b[i] )
+ {
+ double tie = a[i] + b[i];
+ ties += (tie*tie-1)*tie;
+ }
+ }
+ if ( !na || !nb ) return HUGE_VAL;
+
+ // Always work with the smaller U
+ double U_min = ((double)na * nb) - U;
+ if ( U < U_min ) U_min = U;
+
+ if ( na==1 ) return 2.0 * (floor(U_min)+1) / (nb+1);
+ if ( nb==1 ) return 2.0 * (floor(U_min)+1) / (na+1);
+
+ // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
+ if ( na>=8 || nb>=8 )
+ {
+ double mean = ((double)na*nb)*0.5;
+ // Correction for ties:
+ // double N = na+nb;
+ // double var2 = (N*N-1)*N-ties;
+ // if ( var2==0 ) return 1.0;
+ // var2 *= ((double)na*nb)/N/(N-1)/12.0;
+ // No correction for ties:
+ double var2 = ((double)na*nb)*(na+nb+1)/12.0;
+ double z = (U_min - mean)/sqrt(2*var2); // z is N(0,1)
+ return 2.0 - kf_erfc(z); // which is 1 + erf(z)
+ }
+
+ // Exact calculation
+ double pval = 2*mann_whitney_1947_cdf(na,nb,U_min);
+ return pval>1 ? 1 : pval;
+}
+
+double calc_mwu_bias(int *a, int *b, int n)
+{
+ int na = 0, nb = 0, i;
+ double U = 0, ties = 0;
+ for (i=0; i<n; i++)
+ {
+ if (!a[i]) {
+ if (!b[i]) continue;
+ nb += b[i];
+ } else if (!b[i]) {
+ na += a[i];
+ U += a[i] * nb;
+ } else {
+ na += a[i];
+ U += a[i] * (nb + b[i]*0.5);
+ nb += b[i];
+ double tie = a[i] + b[i];
+ ties += (tie*tie-1)*tie;
+ }
+ }
+ if ( !na || !nb ) return HUGE_VAL;
+ if ( na==1 || nb==1 ) return 1.0; // Flat probability, all U values are equally likely
+
+ double mean = ((double)na*nb)*0.5;
+ if ( na==2 || nb==2 )
+ {
+ // Linear approximation
+ return U>mean ? (2.0*mean-U)/mean : U/mean;
+ }
+ // Correction for ties:
+ // double N = na+nb;
+ // double var2 = (N*N-1)*N-ties;
+ // if ( var2==0 ) return 1.0;
+ // var2 *= ((double)na*nb)/N/(N-1)/12.0;
+ // No correction for ties:
+ double var2 = ((double)na*nb)*(na+nb+1)/12.0;
+ if ( na>=8 || nb>=8 )
+ {
+ // Normal approximation, very good for na>=8 && nb>=8 and reasonable if na<8 or nb<8
+ return exp(-0.5*(U-mean)*(U-mean)/var2);
+ }
+
+ // Exact calculation
+ return mann_whitney_1947(na,nb,U) * sqrt(2*M_PI*var2);
+}
+
+static inline double logsumexp2(double a, double b)
+{
+ if ( a>b )
+ return log(1 + exp(b-a)) + a;
+ else
+ return log(1 + exp(a-b)) + b;
+}
+
+void calc_SegBias(const bcf_callret1_t *bcr, bcf_call_t *call)
+{
+ call->seg_bias = HUGE_VAL;
+ if ( !bcr ) return;
+
+ int nr = call->anno[2] + call->anno[3]; // number of observed non-reference reads
+ if ( !nr ) return;
+
+ int avg_dp = (call->anno[0] + call->anno[1] + nr) / call->n; // average depth
+ double M = floor((double)nr / avg_dp + 0.5); // an approximate number of variants samples in the population
+ if ( M>call->n ) M = call->n; // clamp M at the number of samples
+ else if ( M==0 ) M = 1;
+ double f = M / 2. / call->n; // allele frequency
+ double p = (double) nr / call->n; // number of variant reads per sample expected if variant not real (poisson)
+ double q = (double) nr / M; // number of variant reads per sample expected if variant is real (poisson)
+ double sum = 0;
+ const double log2 = log(2.0);
+
+ // fprintf(pysam_stderr,"M=%.1f p=%e q=%e f=%f dp=%d\n",M,p,q,f,avg_dp);
+ int i;
+ for (i=0; i<call->n; i++)
+ {
+ int oi = bcr[i].anno[2] + bcr[i].anno[3]; // observed number of non-ref reads
+ double tmp;
+ if ( oi )
+ {
+ // tmp = log(f) + oi*log(q/p) - q + log(2*(1-f) + f*pow(2,oi)*exp(-q)) + p; // this can under/overflow
+ tmp = logsumexp2(log(2*(1-f)), log(f) + oi*log2 - q);
+ tmp += log(f) + oi*log(q/p) - q + p;
+ }
+ else
+ tmp = log(2*f*(1-f)*exp(-q) + f*f*exp(-2*q) + (1-f)*(1-f)) + p;
+ sum += tmp;
+ // fprintf(pysam_stderr,"oi=%d %e\n", oi,tmp);
+ }
+ call->seg_bias = sum;
+}
+
+/**
+ * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles
+ * @n: number of samples
+ * @calls: each sample's calls
+ * @bca: auxiliary data structure for holding temporary values
+ * @ref_base: the reference base
+ * @call: filled with the annotations
+ *
+ * Combines calls across the various samples being studied
+ * 1. For each allele at each base across all samples the quality is summed so
+ * you end up with a set of quality sums for each allele present 2. The quality
+ * sums are sorted.
+ * 3. Using the sorted quality sums we now create the allele ordering array
+ * A\subN. This is done by doing the following:
+ * a) If the reference allele is known it always comes first, otherwise N
+ * comes first.
+ * b) Then the rest of the alleles are output in descending order of quality
+ * sum (which we already know the qsum array was sorted). Any allelles with
+ * qsum 0 will be excluded.
+ * 4. Using the allele ordering array we create the genotype ordering array.
+ * In the worst case with an unknown reference this will be: A0/A0 A1/A0 A1/A1
+ * A2/A0 A2/A1 A2/A2 A3/A0 A3/A1 A3/A2 A3/A3 A4/A0 A4/A1 A4/A2 A4/A3 A4/A4
+ * 5. The genotype ordering array is then used to extract data from the error
+ * model 5*5 matrix and is used to produce a Phread likelihood array for each
+ * sample.
+ */
+int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call)
+{
+ int ref4, i, j;
+ float qsum[5] = {0,0,0,0,0};
+ if (ref_base >= 0) {
+ call->ori_ref = ref4 = seq_nt16_int[ref_base];
+ if (ref4 > 4) ref4 = 4;
+ } else call->ori_ref = -1, ref4 = 0;
+
+ // calculate qsum, this is done by summing normalized qsum across all samples,
+ // to account for differences in coverage
+ for (i = 0; i < n; ++i)
+ {
+ float sum = 0;
+ for (j = 0; j < 4; ++j) sum += calls[i].qsum[j];
+ if ( sum )
+ for (j = 0; j < 4; j++) qsum[j] += calls[i].qsum[j] / sum;
+ }
+
+ // sort qsum in ascending order (insertion sort)
+ float *ptr[5], *tmp;
+ for (i=0; i<5; i++) ptr[i] = &qsum[i];
+ for (i=1; i<4; i++)
+ for (j=i; j>0 && *ptr[j] < *ptr[j-1]; j--)
+ tmp = ptr[j], ptr[j] = ptr[j-1], ptr[j-1] = tmp;
+
+ // Set the reference allele and alternative allele(s)
+ for (i=0; i<5; i++) call->a[i] = -1;
+ for (i=0; i<5; i++) call->qsum[i] = 0;
+ call->unseen = -1;
+ call->a[0] = ref4;
+ for (i=3, j=1; i>=0; i--) // i: alleles sorted by QS; j, a[j]: output allele ordering
+ {
+ int ipos = ptr[i] - qsum; // position in sorted qsum array
+ if ( ipos==ref4 )
+ call->qsum[0] = qsum[ipos]; // REF's qsum
+ else
+ {
+ if ( !qsum[ipos] ) break; // qsum is 0, this and consequent alleles are not seen in the pileup
+ call->qsum[j] = qsum[ipos];
+ call->a[j++] = ipos;
+ }
+ }
+ if (ref_base >= 0)
+ {
+ // for SNPs, find the "unseen" base
+ if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
+ call->unseen = j, call->a[j++] = ptr[i] - qsum;
+ call->n_alleles = j;
+ }
+ else
+ {
+ call->n_alleles = j;
+ if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
+ }
+ /*
+ * Set the phread likelihood array (call->PL) This array is 15 entries long
+ * for each sample because that is size of an upper or lower triangle of a
+ * worst case 5x5 matrix of possible genotypes. This worst case matrix will
+ * occur when all 4 possible alleles are present and the reference allele
+ * is unknown. The sides of the matrix will correspond to the reference
+ * allele (if known) followed by the alleles present in descending order of
+ * quality sum
+ */
+ {
+ int x, g[15], z;
+ double sum_min = 0.;
+ x = call->n_alleles * (call->n_alleles + 1) / 2;
+ // get the possible genotypes
+ // this is done by creating an ordered list of locations g for call (allele a, allele b) in the genotype likelihood matrix
+ for (i = z = 0; i < call->n_alleles; ++i) {
+ for (j = 0; j <= i; ++j) {
+ g[z++] = call->a[j] * 5 + call->a[i];
+ }
+ }
+ // for each sample calculate the PL
+ for (i = 0; i < n; ++i)
+ {
+ int32_t *PL = call->PL + x * i;
+ const bcf_callret1_t *r = calls + i;
+ float min = FLT_MAX;
+ for (j = 0; j < x; ++j) {
+ if (min > r->p[g[j]]) min = r->p[g[j]];
+ }
+ sum_min += min;
+ for (j = 0; j < x; ++j) {
+ int y;
+ y = (int)(r->p[g[j]] - min + .499);
+ if (y > 255) y = 255;
+ PL[j] = y;
+ }
+ }
+ if ( call->DP4 )
+ {
+ for (i=0; i<n; i++)
+ {
+ call->DP4[4*i] = calls[i].anno[0];
+ call->DP4[4*i+1] = calls[i].anno[1];
+ call->DP4[4*i+2] = calls[i].anno[2];
+ call->DP4[4*i+3] = calls[i].anno[3];
+ }
+ }
+ if ( call->ADF )
+ {
+ assert( call->n_alleles<=B2B_MAX_ALLELES ); // this is always true for SNPs and so far for indels as well
+
+ // reorder ADR,ADF to match the allele ordering at this site
+ int32_t tmp[B2B_MAX_ALLELES];
+ int32_t *adr = call->ADR + B2B_MAX_ALLELES, *adr_out = call->ADR + B2B_MAX_ALLELES;
+ int32_t *adf = call->ADF + B2B_MAX_ALLELES, *adf_out = call->ADF + B2B_MAX_ALLELES;
+ int32_t *adr_tot = call->ADR; // the first bin stores total counts per site
+ int32_t *adf_tot = call->ADF;
+ for (i=0; i<n; i++)
+ {
+ for (j=0; j<call->n_alleles; j++)
+ {
+ tmp[j] = adr[ call->a[j] ];
+ adr_tot[j] += tmp[j];
+ }
+ for (j=0; j<call->n_alleles; j++) adr_out[j] = tmp[j];
+ for (j=0; j<call->n_alleles; j++)
+ {
+ tmp[j] = adf[ call->a[j] ];
+ adf_tot[j] += tmp[j];
+ }
+ for (j=0; j<call->n_alleles; j++) adf_out[j] = tmp[j];
+ adf_out += call->n_alleles;
+ adr_out += call->n_alleles;
+ adr += B2B_MAX_ALLELES;
+ adf += B2B_MAX_ALLELES;
+ }
+ }
+
+// if (ref_base < 0) fprintf(pysam_stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
+ call->shift = (int)(sum_min + .499);
+ }
+ // combine annotations
+ memset(call->anno, 0, 16 * sizeof(double));
+ call->ori_depth = 0;
+ call->depth = 0;
+ call->mq0 = 0;
+ for (i = 0; i < n; ++i) {
+ call->depth += calls[i].anno[0] + calls[i].anno[1] + calls[i].anno[2] + calls[i].anno[3];
+ call->ori_depth += calls[i].ori_depth;
+ call->mq0 += calls[i].mq0;
+ for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
+ }
+
+ calc_SegBias(calls, call);
+
+ // calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
+ // calc_chisq_bias("XMQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_mq, bca->alt_mq, bca->nqual);
+ // calc_chisq_bias("XBQ", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_bq, bca->alt_bq, bca->nqual);
+
+ call->mwu_pos = calc_mwu_bias(bca->ref_pos, bca->alt_pos, bca->npos);
+ call->mwu_mq = calc_mwu_bias(bca->ref_mq, bca->alt_mq, bca->nqual);
+ call->mwu_bq = calc_mwu_bias(bca->ref_bq, bca->alt_bq, bca->nqual);
+ call->mwu_mqs = calc_mwu_bias(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+
+#if CDF_MWU_TESTS
+ call->mwu_pos_cdf = calc_mwu_bias_cdf(bca->ref_pos, bca->alt_pos, bca->npos);
+ call->mwu_mq_cdf = calc_mwu_bias_cdf(bca->ref_mq, bca->alt_mq, bca->nqual);
+ call->mwu_bq_cdf = calc_mwu_bias_cdf(bca->ref_bq, bca->alt_bq, bca->nqual);
+ call->mwu_mqs_cdf = calc_mwu_bias_cdf(bca->fwd_mqs, bca->rev_mqs, bca->nqual);
+#endif
+
+ call->vdb = calc_vdb(bca->alt_pos, bca->npos);
+
+ return 0;
+}
+
+int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
+{
+ extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+ int i, j, nals = 1;
+
+ bcf_hdr_t *hdr = bc->bcf_hdr;
+ rec->rid = bc->tid;
+ rec->pos = bc->pos;
+ rec->qual = 0;
+
+ bc->tmp.l = 0;
+ if (bc->ori_ref < 0) // indel
+ {
+ // REF
+ kputc(ref[bc->pos], &bc->tmp);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
+
+ // ALT
+ for (i=1; i<4; i++)
+ {
+ if (bc->a[i] < 0) break;
+ kputc(',', &bc->tmp); kputc(ref[bc->pos], &bc->tmp);
+
+ if (bca->indel_types[bc->a[i]] < 0) { // deletion
+ for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
+ kputc(ref[bc->pos+1+j], &bc->tmp);
+ } else { // insertion; cannot be a reference unless a bug
+ char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
+ for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
+ kputc("ACGTN"[(int)inscns[j]], &bc->tmp);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
+ }
+ nals++;
+ }
+ }
+ else // SNP
+ {
+ kputc("ACGTN"[bc->ori_ref], &bc->tmp);
+ for (i=1; i<5; i++)
+ {
+ if (bc->a[i] < 0) break;
+ kputc(',', &bc->tmp);
+ if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
+ else kputc("ACGT"[bc->a[i]], &bc->tmp);
+ nals++;
+ }
+ }
+ bcf_update_alleles_str(hdr, rec, bc->tmp.s);
+
+ bc->tmp.l = 0;
+
+ // INFO
+ if (bc->ori_ref < 0)
+ {
+ bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
+ bcf_update_info_int32(hdr, rec, "IDV", &bca->max_support, 1);
+ bcf_update_info_float(hdr, rec, "IMF", &bca->max_frac, 1);
+ }
+ bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
+ if ( fmt_flag&B2B_INFO_ADF )
+ bcf_update_info_int32(hdr, rec, "ADF", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_ADR )
+ bcf_update_info_int32(hdr, rec, "ADR", bc->ADR, rec->n_allele);
+ if ( fmt_flag&(B2B_INFO_AD|B2B_INFO_DPR) )
+ {
+ for (i=0; i<rec->n_allele; i++) bc->ADF[i] += bc->ADR[i];
+ if ( fmt_flag&B2B_INFO_AD )
+ bcf_update_info_int32(hdr, rec, "AD", bc->ADF, rec->n_allele);
+ if ( fmt_flag&B2B_INFO_DPR )
+ bcf_update_info_int32(hdr, rec, "DPR", bc->ADF, rec->n_allele);
+ }
+
+ float tmpf[16];
+ for (i=0; i<16; i++) tmpf[i] = bc->anno[i];
+ bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
+ bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
+
+ if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
+ if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
+ if ( bc->mwu_pos != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
+ if ( bc->mwu_mq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
+ if ( bc->mwu_mqs != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
+ if ( bc->mwu_bq != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
+#if CDF_MWU_TESTS
+ if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
+ if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
+ if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
+ if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
+#endif
+ tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
+ bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
+
+ // FORMAT
+ rec->n_sample = bc->n;
+ bcf_update_format_int32(hdr, rec, "PL", bc->PL, nals*(nals+1)/2 * rec->n_sample);
+ if ( fmt_flag&B2B_FMT_DP )
+ {
+ int32_t *ptr = (int32_t*) bc->fmt_arr;
+ for (i=0; i<bc->n; i++)
+ ptr[i] = bc->DP4[4*i] + bc->DP4[4*i+1] + bc->DP4[4*i+2] + bc->DP4[4*i+3];
+ bcf_update_format_int32(hdr, rec, "DP", bc->fmt_arr, rec->n_sample);
+ }
+ if ( fmt_flag&B2B_FMT_DV )
+ {
+ int32_t *ptr = (int32_t*) bc->fmt_arr;
+ for (i=0; i<bc->n; i++)
+ ptr[i] = bc->DP4[4*i+2] + bc->DP4[4*i+3];
+ bcf_update_format_int32(hdr, rec, "DV", bc->fmt_arr, rec->n_sample);
+ }
+ if ( fmt_flag&B2B_FMT_SP )
+ {
+ int32_t *ptr = (int32_t*) bc->fmt_arr;
+ for (i=0; i<bc->n; i++)
+ {
+ int fwd_ref = bc->DP4[4*i], rev_ref = bc->DP4[4*i+1], fwd_alt = bc->DP4[4*i+2], rev_alt = bc->DP4[4*i+3];
+ if ( fwd_ref+rev_ref<2 || fwd_alt+rev_alt<2 || fwd_ref+fwd_alt<2 || rev_ref+rev_alt<2 )
+ ptr[i] = 0;
+ else
+ {
+ double left, right, two;
+ kt_fisher_exact(fwd_ref, rev_ref, fwd_alt, rev_alt, &left, &right, &two);
+ int32_t x = (int)(-4.343 * log(two) + .499);
+ if (x > 255) x = 255;
+ ptr[i] = x;
+ }
+ }
+ bcf_update_format_int32(hdr, rec, "SP", bc->fmt_arr, rec->n_sample);
+ }
+ if ( fmt_flag&B2B_FMT_DP4 )
+ bcf_update_format_int32(hdr, rec, "DP4", bc->DP4, rec->n_sample*4);
+ if ( fmt_flag&B2B_FMT_ADF )
+ bcf_update_format_int32(hdr, rec, "ADF", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_ADR )
+ bcf_update_format_int32(hdr, rec, "ADR", bc->ADR+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&(B2B_FMT_AD|B2B_FMT_DPR) )
+ {
+ for (i=0; i<rec->n_sample*rec->n_allele; i++) bc->ADF[B2B_MAX_ALLELES+i] += bc->ADR[B2B_MAX_ALLELES+i];
+ if ( fmt_flag&B2B_FMT_AD )
+ bcf_update_format_int32(hdr, rec, "AD", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ if ( fmt_flag&B2B_FMT_DPR )
+ bcf_update_format_int32(hdr, rec, "DPR", bc->ADF+B2B_MAX_ALLELES, rec->n_sample*rec->n_allele);
+ }
+
+ return 0;
+}
--- /dev/null
+/* bam2bcf.h -- variant calling.
+
+ Copyright (C) 2010-2012 Broad Institute.
+ Copyright (C) 2012-2014,2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef BAM2BCF_H
+#define BAM2BCF_H
+
+#include <stdint.h>
+#include <htslib/hts.h>
+#include <htslib/vcf.h>
+
+/**
+ * A simplified version of Mann-Whitney U-test is calculated
+ * by default (no CDF) because it is faster and seems to work
+ * better in machine learning filtering. When enabled by setting
+ * CDF_MWU_TESTS, additional annotations will appear on mpileup's
+ * output (RPB2 in addition to RPB, etc.).
+ */
+#ifndef CDF_MWU_TESTS
+#define CDF_MWU_TESTS 0
+#endif
+
+#define B2B_INDEL_NULL 10000
+
+#define B2B_FMT_DP (1<<0)
+#define B2B_FMT_SP (1<<1)
+#define B2B_FMT_DV (1<<2)
+#define B2B_FMT_DP4 (1<<3)
+#define B2B_FMT_DPR (1<<4)
+#define B2B_INFO_DPR (1<<5)
+#define B2B_FMT_AD (1<<6)
+#define B2B_FMT_ADF (1<<7)
+#define B2B_FMT_ADR (1<<8)
+#define B2B_INFO_AD (1<<9)
+#define B2B_INFO_ADF (1<<10)
+#define B2B_INFO_ADR (1<<11)
+
+#define B2B_MAX_ALLELES 5
+
+typedef struct __bcf_callaux_t {
+ int capQ, min_baseQ;
+ int openQ, extQ, tandemQ; // for indels
+ uint32_t min_support, max_support; // for collecting indel candidates
+ double min_frac; // for collecting indel candidates
+ float max_frac; // for collecting indel candidates
+ int per_sample_flt; // indel filtering strategy
+ int *ref_pos, *alt_pos, npos, *ref_mq, *alt_mq, *ref_bq, *alt_bq, *fwd_mqs, *rev_mqs, nqual; // for bias tests
+ // for internal uses
+ int max_bases;
+ int indel_types[4]; // indel lengths
+ int maxins, indelreg;
+ int read_len;
+ char *inscns;
+ uint16_t *bases; // 5bit: unused, 6:quality, 1:is_rev, 4:2-bit base or indel allele (index to bcf_callaux_t.indel_types)
+ errmod_t *e;
+ void *rghash;
+} bcf_callaux_t;
+
+typedef struct {
+ uint32_t ori_depth;
+ unsigned int mq0;
+ int32_t *ADF, *ADR;
+ float qsum[4];
+ // The fields are:
+ // depth fwd .. ref (0) and non-ref (2)
+ // depth rev .. ref (1) and non-ref (3)
+ // baseQ .. ref (4) and non-ref (6)
+ // baseQ^2 .. ref (5) and non-ref (7)
+ // mapQ .. ref (8) and non-ref (10)
+ // mapQ^2 .. ref (9) and non-ref (11)
+ // minDist .. ref (12) and non-ref (14)
+ // minDist^2 .. ref (13) and non-ref (15)
+ // Note that this probably needs a more thorough fix: int types in
+ // bcf_call_t do overflow with high-coverage data, such as exomes, and
+ // BCFv2 supports only floats which may not suffice.
+ double anno[16];
+ float p[25]; // phred-scaled likelihood of each genotype
+} bcf_callret1_t;
+
+typedef struct {
+ int tid, pos;
+ bcf_hdr_t *bcf_hdr;
+ int a[5]; // alleles: ref, alt, alt2, alt3
+ float qsum[5]; // for the QS tag
+ int n, n_alleles, shift, ori_ref, unseen;
+ int n_supp; // number of supporting non-reference reads
+ double anno[16];
+ unsigned int depth, ori_depth, mq0;
+ int32_t *PL, *DP4, *ADR, *ADF;
+ uint8_t *fmt_arr;
+ float vdb; // variant distance bias
+ float mwu_pos, mwu_mq, mwu_bq, mwu_mqs;
+#if CDF_MWU_TESTS
+ float mwu_pos_cdf, mwu_mq_cdf, mwu_bq_cdf, mwu_mqs_cdf;
+#endif
+ float seg_bias;
+ kstring_t tmp;
+} bcf_call_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ);
+ void bcf_call_destroy(bcf_callaux_t *bca);
+ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
+ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call);
+ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag,
+ const bcf_callaux_t *bca, const char *ref);
+ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref);
+ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/* bam2bcf_indel.c -- indel caller.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2012-2014,2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
+#include "bam2bcf.h"
+
+#include <htslib/ksort.h>
+KSORT_INIT_GENERIC(uint32_t)
+
+#define MINUS_CONST 0x10000000
+#define INDEL_WINDOW_SIZE 50
+
+static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
+{
+ int k, x = c->pos, y = 0, last_y = 0;
+ *_tpos = c->pos;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k] & BAM_CIGAR_MASK;
+ int l = cigar[k] >> BAM_CIGAR_SHIFT;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ if (c->pos > tpos) return y;
+ if (x + l > tpos) {
+ *_tpos = tpos;
+ return y + (tpos - x);
+ }
+ x += l; y += l;
+ last_y = y;
+ } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+ if (x + l > tpos) {
+ *_tpos = is_left? x : x + l;
+ return y;
+ }
+ x += l;
+ }
+ }
+ *_tpos = x;
+ return last_y;
+}
+// FIXME: check if the inserted sequence is consistent with the homopolymer run
+// l is the relative gap length and l_run is the length of the homopolymer on the reference
+static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
+{
+ int q, qh;
+ q = bca->openQ + bca->extQ * (abs(l) - 1);
+ qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
+ return q < qh? q : qh;
+}
+
+static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
+{
+ int i, j, max = 0, max_i = pos, score = 0;
+ l = abs(l);
+ for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
+ if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
+ else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
+ if (score < 0) break;
+ if (max < score) max = score, max_i = i;
+ }
+ return max_i - pos;
+}
+
+/*
+ notes:
+ - n .. number of samples
+ - the routine sets bam_pileup1_t.aux of each read as follows:
+ - 6: unused
+ - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
+ - 8: estimated sequence quality .. (aux>>8)&0xff
+ - 8: indel quality .. aux&0xff
+ */
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+{
+ int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+ int N, K, l_run, ref_type, n_alt;
+ char *inscns = 0, *ref2, *query, **ref_sample;
+ if (ref == 0 || bca == 0) return -1;
+
+ // determine if there is a gap
+ for (s = N = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i)
+ if (plp[s][i].indel != 0) break;
+ if (i < n_plp[s]) break;
+ }
+ if (s == n) return -1; // there is no indel at this position.
+ for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
+ { // find out how many types of indels are present
+ bca->max_support = bca->max_frac = 0;
+ int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
+ uint32_t *aux;
+ aux = (uint32_t*) calloc(N + 1, 4);
+ m = max_rd_len = 0;
+ aux[m++] = MINUS_CONST; // zero indel is always a type
+ for (s = 0; s < n; ++s) {
+ int na = 0, nt = 0;
+ for (i = 0; i < n_plp[s]; ++i) {
+ const bam_pileup1_t *p = plp[s] + i;
+ ++nt;
+ if (p->indel != 0) {
+ ++na;
+ aux[m++] = MINUS_CONST + p->indel;
+ }
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+ if (j > max_rd_len) max_rd_len = j;
+ }
+ double frac = (double)na/nt;
+ if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
+ indel_support_ok = 1;
+ if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
+ n_alt += na;
+ n_tot += nt;
+ }
+ // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
+ // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
+ int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
+ if ( nN*2>(i-pos) ) { free(aux); return -1; }
+
+ ks_introsort(uint32_t, m, aux);
+ // squeeze out identical types
+ for (i = 1, n_types = 1; i < m; ++i)
+ if (aux[i] != aux[i-1]) ++n_types;
+ // Taking totals makes it hard to call rare indels
+ if ( !bca->per_sample_flt )
+ indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
+ if ( n_types == 1 || !indel_support_ok ) { // then skip
+ free(aux); return -1;
+ }
+ if (n_types >= 64) {
+ free(aux);
+ // TODO revisit how/whether to control printing this warning
+ if (hts_verbose >= 2)
+ fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
+ return -1;
+ }
+ types = (int*)calloc(n_types, sizeof(int));
+ t = 0;
+ types[t++] = aux[0] - MINUS_CONST;
+ for (i = 1; i < m; ++i)
+ if (aux[i] != aux[i-1])
+ types[t++] = aux[i] - MINUS_CONST;
+ free(aux);
+ for (t = 0; t < n_types; ++t)
+ if (types[t] == 0) break;
+ ref_type = t; // the index of the reference type (0)
+ }
+ { // calculate left and right boundary
+ left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+ right = pos + INDEL_WINDOW_SIZE;
+ if (types[0] < 0) right -= types[0];
+ // in case the alignments stand out the reference
+ for (i = pos; i < right; ++i)
+ if (ref[i] == 0) break;
+ right = i;
+ }
+ /* The following block fixes a long-existing flaw in the INDEL
+ * calling model: the interference of nearby SNPs. However, it also
+ * reduces the power because sometimes, substitutions caused by
+ * indels are not distinguishable from true mutations. Multiple
+ * sequence realignment helps to increase the power.
+ *
+ * Masks mismatches present in at least 70% of the reads with 'N'.
+ */
+ { // construct per-sample consensus
+ int L = right - left + 1, max_i, max2_i;
+ uint32_t *cns, max, max2;
+ char *ref0, *r;
+ ref_sample = (char**) calloc(n, sizeof(char*));
+ cns = (uint32_t*) calloc(L, 4);
+ ref0 = (char*) calloc(L, 1);
+ for (i = 0; i < right - left; ++i)
+ ref0[i] = seq_nt16_table[(int)ref[i+left]];
+ for (s = 0; s < n; ++s) {
+ r = ref_sample[s] = (char*) calloc(L, 1);
+ memset(cns, 0, sizeof(int) * L);
+ // collect ref and non-ref counts
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ bam1_t *b = p->b;
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+ int x = b->core.pos, y = 0;
+ for (k = 0; k < b->core.n_cigar; ++k) {
+ int op = cigar[k]&0xf;
+ int j, l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ for (j = 0; j < l; ++j)
+ if (x + j >= left && x + j < right)
+ cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
+ x += l; y += l;
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ }
+ }
+ // determine the consensus
+ for (i = 0; i < right - left; ++i) r[i] = ref0[i];
+ max = max2 = 0; max_i = max2_i = -1;
+ for (i = 0; i < right - left; ++i) {
+ if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+ else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
+ }
+ if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
+ if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
+ if (max_i >= 0) r[max_i] = 15;
+ if (max2_i >= 0) r[max2_i] = 15;
+ //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr);
+ }
+ free(ref0); free(cns);
+ }
+ { // the length of the homopolymer run around the current position
+ int c = seq_nt16_table[(int)ref[pos + 1]];
+ if (c == 15) l_run = 1;
+ else {
+ for (i = pos + 2; ref[i]; ++i)
+ if (seq_nt16_table[(int)ref[i]] != c) break;
+ l_run = i;
+ for (i = pos; i >= 0; --i)
+ if (seq_nt16_table[(int)ref[i]] != c) break;
+ l_run -= i + 1;
+ }
+ }
+ // construct the consensus sequence
+ max_ins = types[n_types - 1]; // max_ins is at least 0
+ if (max_ins > 0) {
+ int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
+ // count the number of occurrences of each base at each position for each type of insertion
+ for (t = 0; t < n_types; ++t) {
+ if (types[t] > 0) {
+ for (s = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ if (p->indel == types[t]) {
+ uint8_t *seq = bam_get_seq(p->b);
+ for (k = 1; k <= p->indel; ++k) {
+ int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
+ assert(c<5);
+ ++inscns_aux[(t*max_ins+(k-1))*5 + c];
+ }
+ }
+ }
+ }
+ }
+ }
+ // use the majority rule to construct the consensus
+ inscns = (char*) calloc(n_types * max_ins, 1);
+ for (t = 0; t < n_types; ++t) {
+ for (j = 0; j < types[t]; ++j) {
+ int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+ for (k = 0; k < 5; ++k)
+ if (ia[k] > max)
+ max = ia[k], max_k = k;
+ inscns[t*max_ins + j] = max? max_k : 4;
+ if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
+ }
+ }
+ free(inscns_aux);
+ }
+ // compute the likelihood given each type of indel for each read
+ max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
+ ref2 = (char*) calloc(max_ref2, 1);
+ query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
+ score1 = (int*) calloc(N * n_types, sizeof(int));
+ score2 = (int*) calloc(N * n_types, sizeof(int));
+ bca->indelreg = 0;
+ for (t = 0; t < n_types; ++t) {
+ int l, ir;
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ apf1.bw = apf2.bw = abs(types[t]) + 3;
+ // compute indelreg
+ if (types[t] == 0) ir = 0;
+ else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+ else ir = est_indelreg(pos, ref, -types[t], 0);
+ if (ir > bca->indelreg) bca->indelreg = ir;
+// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
+ // realignment
+ for (s = K = 0; s < n; ++s) {
+ // write ref2
+ for (k = 0, j = left; j <= pos; ++j)
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
+ if (types[t] <= 0) j += -types[t];
+ else for (l = 0; l < types[t]; ++l)
+ ref2[k++] = inscns[t*max_ins + l];
+ for (; j < right && ref[j]; ++j)
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
+ for (; k < max_ref2; ++k) ref2[k] = 4;
+ if (j < right) right = j;
+ // align each read to ref2
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ int qbeg, qend, tbeg, tend, sc, kk;
+ uint8_t *seq = bam_get_seq(p->b);
+ uint32_t *cigar = bam_get_cigar(p->b);
+ if (p->b->core.flag&4) continue; // unmapped reads
+ // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
+ for (kk = 0; kk < p->b->core.n_cigar; ++kk)
+ if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
+ if (kk < p->b->core.n_cigar) continue;
+ // FIXME: the following skips soft clips, but using them may be more sensitive.
+ // determine the start and end of sequences for alignment
+ qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
+ qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
+ if (types[t] < 0) {
+ int l = -types[t];
+ tbeg = tbeg - l > left? tbeg - l : left;
+ }
+ // write the query sequence
+ for (l = qbeg; l < qend; ++l)
+ query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
+ { // do realignment; this is the bottleneck
+ const uint8_t *qual = bam_get_qual(p->b), *bq;
+ uint8_t *qq;
+ qq = (uint8_t*) calloc(qend - qbeg, 1);
+ bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+ if (bq) ++bq; // skip type
+ for (l = qbeg; l < qend; ++l) {
+ qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
+ if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
+ if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
+ }
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
+ if (l > 255) l = 255;
+ score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
+ if (sc > 5) {
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ l = (int)(100. * sc / (qend - qbeg) + .499);
+ if (l > 255) l = 255;
+ score2[K*n_types + t] = sc<<8 | l;
+ }
+ free(qq);
+ }
+/*
+ for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
+ fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
+ fputc('\n', stderr);
+ for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
+ fputc('\n', stderr);
+ fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
+*/
+ }
+ }
+ }
+ free(ref2); free(query);
+ { // compute indelQ
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
+ for (s = K = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
+ for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+ tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+ /* errmod_cal() assumes that if the call is wrong, the
+ * likelihoods of other events are equal. This is about
+ * right for substitutions, but is not desired for
+ * indels. To reuse errmod_cal(), I have to make
+ * compromise for multi-allelic indels.
+ */
+ if ((sc[0]&0x3f) == ref_type) {
+ indelQ1 = (sc[1]>>14) - (sc[0]>>14);
+ seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+ } else {
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sc[t]&0x3f) == ref_type) break;
+ indelQ1 = (sc[t]>>14) - (sc[0]>>14);
+ seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+ }
+ tmp = sc[0]>>6 & 0xff;
+ indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
+ sct = &score2[K*n_types];
+ for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+ tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+ if ((sc[0]&0x3f) == ref_type) {
+ indelQ2 = (sc[1]>>14) - (sc[0]>>14);
+ } else {
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sc[t]&0x3f) == ref_type) break;
+ indelQ2 = (sc[t]>>14) - (sc[0]>>14);
+ }
+ tmp = sc[0]>>6 & 0xff;
+ indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
+ // pick the smaller between indelQ1 and indelQ2
+ indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
+ if (indelQ > 255) indelQ = 255;
+ if (seqQ > 255) seqQ = 255;
+ p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
+ sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
+ }
+ }
+ // determine bca->indel_types[] and bca->inscns
+ bca->maxins = max_ins;
+ bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
+ for (t = 0; t < n_types; ++t)
+ sumq[t] = sumq[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+ tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sumq[t]&0x3f) == ref_type) break;
+ if (t) { // then move the reference type to the first
+ tmp = sumq[t];
+ for (; t > 0; --t) sumq[t] = sumq[t-1];
+ sumq[0] = tmp;
+ }
+ for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+ for (t = 0; t < 4 && t < n_types; ++t) {
+ bca->indel_types[t] = types[sumq[t]&0x3f];
+ memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+ }
+ // update p->aux
+ for (s = n_alt = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ int x = types[p->aux>>16&0x3f];
+ for (j = 0; j < 4; ++j)
+ if (x == bca->indel_types[j]) break;
+ p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+ if ((p->aux>>16&0x3f) > 0) ++n_alt;
+ //fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+ }
+ }
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
+ }
+ free(score1); free(score2);
+ // free
+ for (i = 0; i < n; ++i) free(ref_sample[i]);
+ free(ref_sample);
+ free(types); free(inscns);
+ return n_alt > 0? 0 : -1;
+}
--- /dev/null
+#include "pysam.h"
+
+/* bam2bcf_indel.c -- indel caller.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2012-2014,2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <htslib/hts.h>
+#include <htslib/sam.h>
+#include <htslib/khash_str2int.h>
+#include "bam2bcf.h"
+
+#include <htslib/ksort.h>
+KSORT_INIT_GENERIC(uint32_t)
+
+#define MINUS_CONST 0x10000000
+#define INDEL_WINDOW_SIZE 50
+
+static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
+{
+ int k, x = c->pos, y = 0, last_y = 0;
+ *_tpos = c->pos;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k] & BAM_CIGAR_MASK;
+ int l = cigar[k] >> BAM_CIGAR_SHIFT;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ if (c->pos > tpos) return y;
+ if (x + l > tpos) {
+ *_tpos = tpos;
+ return y + (tpos - x);
+ }
+ x += l; y += l;
+ last_y = y;
+ } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+ if (x + l > tpos) {
+ *_tpos = is_left? x : x + l;
+ return y;
+ }
+ x += l;
+ }
+ }
+ *_tpos = x;
+ return last_y;
+}
+// FIXME: check if the inserted sequence is consistent with the homopolymer run
+// l is the relative gap length and l_run is the length of the homopolymer on the reference
+static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
+{
+ int q, qh;
+ q = bca->openQ + bca->extQ * (abs(l) - 1);
+ qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
+ return q < qh? q : qh;
+}
+
+static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
+{
+ int i, j, max = 0, max_i = pos, score = 0;
+ l = abs(l);
+ for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
+ if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
+ else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
+ if (score < 0) break;
+ if (max < score) max = score, max_i = i;
+ }
+ return max_i - pos;
+}
+
+/*
+ notes:
+ - n .. number of samples
+ - the routine sets bam_pileup1_t.aux of each read as follows:
+ - 6: unused
+ - 6: the call; index to bcf_callaux_t.indel_types .. (aux>>16)&0x3f
+ - 8: estimated sequence quality .. (aux>>8)&0xff
+ - 8: indel quality .. aux&0xff
+ */
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref)
+{
+ int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+ int N, K, l_run, ref_type, n_alt;
+ char *inscns = 0, *ref2, *query, **ref_sample;
+ if (ref == 0 || bca == 0) return -1;
+
+ // determine if there is a gap
+ for (s = N = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i)
+ if (plp[s][i].indel != 0) break;
+ if (i < n_plp[s]) break;
+ }
+ if (s == n) return -1; // there is no indel at this position.
+ for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
+ { // find out how many types of indels are present
+ bca->max_support = bca->max_frac = 0;
+ int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
+ uint32_t *aux;
+ aux = (uint32_t*) calloc(N + 1, 4);
+ m = max_rd_len = 0;
+ aux[m++] = MINUS_CONST; // zero indel is always a type
+ for (s = 0; s < n; ++s) {
+ int na = 0, nt = 0;
+ for (i = 0; i < n_plp[s]; ++i) {
+ const bam_pileup1_t *p = plp[s] + i;
+ ++nt;
+ if (p->indel != 0) {
+ ++na;
+ aux[m++] = MINUS_CONST + p->indel;
+ }
+ j = bam_cigar2qlen(p->b->core.n_cigar, bam_get_cigar(p->b));
+ if (j > max_rd_len) max_rd_len = j;
+ }
+ double frac = (double)na/nt;
+ if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
+ indel_support_ok = 1;
+ if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
+ n_alt += na;
+ n_tot += nt;
+ }
+ // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
+ // check the number of N's in the sequence and skip places where half or more reference bases are Ns.
+ int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
+ if ( nN*2>(i-pos) ) { free(aux); return -1; }
+
+ ks_introsort(uint32_t, m, aux);
+ // squeeze out identical types
+ for (i = 1, n_types = 1; i < m; ++i)
+ if (aux[i] != aux[i-1]) ++n_types;
+ // Taking totals makes it hard to call rare indels
+ if ( !bca->per_sample_flt )
+ indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
+ if ( n_types == 1 || !indel_support_ok ) { // then skip
+ free(aux); return -1;
+ }
+ if (n_types >= 64) {
+ free(aux);
+ // TODO revisit how/whether to control printing this warning
+ if (hts_verbose >= 2)
+ fprintf(pysam_stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1);
+ return -1;
+ }
+ types = (int*)calloc(n_types, sizeof(int));
+ t = 0;
+ types[t++] = aux[0] - MINUS_CONST;
+ for (i = 1; i < m; ++i)
+ if (aux[i] != aux[i-1])
+ types[t++] = aux[i] - MINUS_CONST;
+ free(aux);
+ for (t = 0; t < n_types; ++t)
+ if (types[t] == 0) break;
+ ref_type = t; // the index of the reference type (0)
+ }
+ { // calculate left and right boundary
+ left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+ right = pos + INDEL_WINDOW_SIZE;
+ if (types[0] < 0) right -= types[0];
+ // in case the alignments stand out the reference
+ for (i = pos; i < right; ++i)
+ if (ref[i] == 0) break;
+ right = i;
+ }
+ /* The following block fixes a long-existing flaw in the INDEL
+ * calling model: the interference of nearby SNPs. However, it also
+ * reduces the power because sometimes, substitutions caused by
+ * indels are not distinguishable from true mutations. Multiple
+ * sequence realignment helps to increase the power.
+ *
+ * Masks mismatches present in at least 70% of the reads with 'N'.
+ */
+ { // construct per-sample consensus
+ int L = right - left + 1, max_i, max2_i;
+ uint32_t *cns, max, max2;
+ char *ref0, *r;
+ ref_sample = (char**) calloc(n, sizeof(char*));
+ cns = (uint32_t*) calloc(L, 4);
+ ref0 = (char*) calloc(L, 1);
+ for (i = 0; i < right - left; ++i)
+ ref0[i] = seq_nt16_table[(int)ref[i+left]];
+ for (s = 0; s < n; ++s) {
+ r = ref_sample[s] = (char*) calloc(L, 1);
+ memset(cns, 0, sizeof(int) * L);
+ // collect ref and non-ref counts
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ bam1_t *b = p->b;
+ uint32_t *cigar = bam_get_cigar(b);
+ uint8_t *seq = bam_get_seq(b);
+ int x = b->core.pos, y = 0;
+ for (k = 0; k < b->core.n_cigar; ++k) {
+ int op = cigar[k]&0xf;
+ int j, l = cigar[k]>>4;
+ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+ for (j = 0; j < l; ++j)
+ if (x + j >= left && x + j < right)
+ cns[x+j-left] += (bam_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000;
+ x += l; y += l;
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ }
+ }
+ // determine the consensus
+ for (i = 0; i < right - left; ++i) r[i] = ref0[i];
+ max = max2 = 0; max_i = max2_i = -1;
+ for (i = 0; i < right - left; ++i) {
+ if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i;
+ else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i;
+ }
+ if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1;
+ if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1;
+ if (max_i >= 0) r[max_i] = 15;
+ if (max2_i >= 0) r[max2_i] = 15;
+ //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], pysam_stderr); fputc('\n', pysam_stderr);
+ }
+ free(ref0); free(cns);
+ }
+ { // the length of the homopolymer run around the current position
+ int c = seq_nt16_table[(int)ref[pos + 1]];
+ if (c == 15) l_run = 1;
+ else {
+ for (i = pos + 2; ref[i]; ++i)
+ if (seq_nt16_table[(int)ref[i]] != c) break;
+ l_run = i;
+ for (i = pos; i >= 0; --i)
+ if (seq_nt16_table[(int)ref[i]] != c) break;
+ l_run -= i + 1;
+ }
+ }
+ // construct the consensus sequence
+ max_ins = types[n_types - 1]; // max_ins is at least 0
+ if (max_ins > 0) {
+ int *inscns_aux = (int*) calloc(5 * n_types * max_ins, sizeof(int));
+ // count the number of occurrences of each base at each position for each type of insertion
+ for (t = 0; t < n_types; ++t) {
+ if (types[t] > 0) {
+ for (s = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ if (p->indel == types[t]) {
+ uint8_t *seq = bam_get_seq(p->b);
+ for (k = 1; k <= p->indel; ++k) {
+ int c = seq_nt16_int[bam_seqi(seq, p->qpos + k)];
+ assert(c<5);
+ ++inscns_aux[(t*max_ins+(k-1))*5 + c];
+ }
+ }
+ }
+ }
+ }
+ }
+ // use the majority rule to construct the consensus
+ inscns = (char*) calloc(n_types * max_ins, 1);
+ for (t = 0; t < n_types; ++t) {
+ for (j = 0; j < types[t]; ++j) {
+ int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5];
+ for (k = 0; k < 5; ++k)
+ if (ia[k] > max)
+ max = ia[k], max_k = k;
+ inscns[t*max_ins + j] = max? max_k : 4;
+ if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's
+ }
+ }
+ free(inscns_aux);
+ }
+ // compute the likelihood given each type of indel for each read
+ max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
+ ref2 = (char*) calloc(max_ref2, 1);
+ query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1);
+ score1 = (int*) calloc(N * n_types, sizeof(int));
+ score2 = (int*) calloc(N * n_types, sizeof(int));
+ bca->indelreg = 0;
+ for (t = 0; t < n_types; ++t) {
+ int l, ir;
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ apf1.bw = apf2.bw = abs(types[t]) + 3;
+ // compute indelreg
+ if (types[t] == 0) ir = 0;
+ else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+ else ir = est_indelreg(pos, ref, -types[t], 0);
+ if (ir > bca->indelreg) bca->indelreg = ir;
+// fprintf(pysam_stderr, "%d, %d, %d\n", pos, types[t], ir);
+ // realignment
+ for (s = K = 0; s < n; ++s) {
+ // write ref2
+ for (k = 0, j = left; j <= pos; ++j)
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
+ if (types[t] <= 0) j += -types[t];
+ else for (l = 0; l < types[t]; ++l)
+ ref2[k++] = inscns[t*max_ins + l];
+ for (; j < right && ref[j]; ++j)
+ ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]];
+ for (; k < max_ref2; ++k) ref2[k] = 4;
+ if (j < right) right = j;
+ // align each read to ref2
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ int qbeg, qend, tbeg, tend, sc, kk;
+ uint8_t *seq = bam_get_seq(p->b);
+ uint32_t *cigar = bam_get_cigar(p->b);
+ if (p->b->core.flag&4) continue; // unmapped reads
+ // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway.
+ for (kk = 0; kk < p->b->core.n_cigar; ++kk)
+ if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break;
+ if (kk < p->b->core.n_cigar) continue;
+ // FIXME: the following skips soft clips, but using them may be more sensitive.
+ // determine the start and end of sequences for alignment
+ qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left, 0, &tbeg);
+ qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right, 1, &tend);
+ if (types[t] < 0) {
+ int l = -types[t];
+ tbeg = tbeg - l > left? tbeg - l : left;
+ }
+ // write the query sequence
+ for (l = qbeg; l < qend; ++l)
+ query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)];
+ { // do realignment; this is the bottleneck
+ const uint8_t *qual = bam_get_qual(p->b), *bq;
+ uint8_t *qq;
+ qq = (uint8_t*) calloc(qend - qbeg, 1);
+ bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+ if (bq) ++bq; // skip type
+ for (l = qbeg; l < qend; ++l) {
+ qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
+ if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
+ if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
+ }
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
+ if (l > 255) l = 255;
+ score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
+ if (sc > 5) {
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ l = (int)(100. * sc / (qend - qbeg) + .499);
+ if (l > 255) l = 255;
+ score2[K*n_types + t] = sc<<8 | l;
+ }
+ free(qq);
+ }
+/*
+ for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
+ fputc("ACGTN"[(int)ref2[tbeg-left+l]], pysam_stderr);
+ fputc('\n', pysam_stderr);
+ for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], pysam_stderr);
+ fputc('\n', pysam_stderr);
+ fprintf(pysam_stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
+*/
+ }
+ }
+ }
+ free(ref2); free(query);
+ { // compute indelQ
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
+ for (s = K = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
+ for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+ tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+ /* errmod_cal() assumes that if the call is wrong, the
+ * likelihoods of other events are equal. This is about
+ * right for substitutions, but is not desired for
+ * indels. To reuse errmod_cal(), I have to make
+ * compromise for multi-allelic indels.
+ */
+ if ((sc[0]&0x3f) == ref_type) {
+ indelQ1 = (sc[1]>>14) - (sc[0]>>14);
+ seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+ } else {
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sc[t]&0x3f) == ref_type) break;
+ indelQ1 = (sc[t]>>14) - (sc[0]>>14);
+ seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+ }
+ tmp = sc[0]>>6 & 0xff;
+ indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
+ sct = &score2[K*n_types];
+ for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+ tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+ if ((sc[0]&0x3f) == ref_type) {
+ indelQ2 = (sc[1]>>14) - (sc[0]>>14);
+ } else {
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sc[t]&0x3f) == ref_type) break;
+ indelQ2 = (sc[t]>>14) - (sc[0]>>14);
+ }
+ tmp = sc[0]>>6 & 0xff;
+ indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
+ // pick the smaller between indelQ1 and indelQ2
+ indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
+ if (indelQ > 255) indelQ = 255;
+ if (seqQ > 255) seqQ = 255;
+ p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total
+ sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+// fprintf(pysam_stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ);
+ }
+ }
+ // determine bca->indel_types[] and bca->inscns
+ bca->maxins = max_ins;
+ bca->inscns = (char*) realloc(bca->inscns, bca->maxins * 4);
+ for (t = 0; t < n_types; ++t)
+ sumq[t] = sumq[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+ tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sumq[t]&0x3f) == ref_type) break;
+ if (t) { // then move the reference type to the first
+ tmp = sumq[t];
+ for (; t > 0; --t) sumq[t] = sumq[t-1];
+ sumq[0] = tmp;
+ }
+ for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+ for (t = 0; t < 4 && t < n_types; ++t) {
+ bca->indel_types[t] = types[sumq[t]&0x3f];
+ memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+ }
+ // update p->aux
+ for (s = n_alt = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ int x = types[p->aux>>16&0x3f];
+ for (j = 0; j < 4; ++j)
+ if (x == bca->indel_types[j]) break;
+ p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+ if ((p->aux>>16&0x3f) > 0) ++n_alt;
+ //fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
+ }
+ }
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
+ }
+ free(score1); free(score2);
+ // free
+ for (i = 0; i < n; ++i) free(ref_sample[i]);
+ free(ref_sample);
+ free(types); free(inscns);
+ return n_alt > 0? 0 : -1;
+}
--- /dev/null
+/* bam_sample.c -- group data by sample.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <khash_str2str.h>
+#include "bam_sample.h"
+#include "bcftools.h"
+
+
+typedef struct
+{
+ char *fname;
+ void *rg2idx; // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup
+ int default_idx; // default BCF output sample index, set only when all readgroups are treated as one sample
+}
+file_t;
+
+struct _bam_smpl_t
+{
+ kstring_t tmp;
+ file_t *files;
+ int ignore_rg, nsmpl, nfiles;
+ char **smpl; // list of BCF output sample names. Maintained by bsmpl_add_readgroup
+ void *sample_list; // hash: BAM input sample name to BCF output sample name. This is the -s/-S list
+ int sample_logic; // the -s/-S logic, 1: include, 0: exclude
+ void *rg_list; // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list
+ int rg_logic; // the -G logic, 1: include, 0: exclude
+ void *name2idx; // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup
+};
+
+bam_smpl_t *bam_smpl_init(void)
+{
+ bam_smpl_t *bsmpl;
+ bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t));
+ bsmpl->name2idx = khash_str2int_init();
+ return bsmpl;
+}
+
+void bam_smpl_destroy(bam_smpl_t *bsmpl)
+{
+ if ( !bsmpl ) return;
+ if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx);
+ if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list);
+ if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list);
+ int i;
+ for (i=0; i<bsmpl->nfiles; i++)
+ {
+ file_t *file = &bsmpl->files[i];
+ if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
+ free(file->fname);
+ }
+ free(bsmpl->smpl);
+ free(bsmpl->files);
+ free(bsmpl->tmp.s);
+ free(bsmpl);
+}
+
+void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl)
+{
+ bsmpl->ignore_rg = 1;
+}
+
+static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name)
+{
+ int ismpl = -1;
+ if ( smpl_name )
+ {
+ if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 )
+ {
+ // new sample
+ bsmpl->nsmpl++;
+ bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl);
+ bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name);
+ ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]);
+ }
+ }
+ if ( !strcmp("*",rg_id) )
+ {
+ // all read groups in the bam treated as the same sample
+ file->default_idx = ismpl;
+ return;
+ }
+ if ( !file->rg2idx ) file->rg2idx = khash_str2int_init();
+ if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return; // duplicate @RG:ID
+ khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl);
+}
+static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
+{
+ char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id); // unique read group present in one bam only
+ if ( !rg_smpl )
+ {
+ // read group specific to this bam
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl )
+ {
+ // any read group in this file?
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"*\t%s",file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl && bsmpl->rg_logic ) return 0;
+ if ( rg_smpl && !bsmpl->rg_logic ) return 0;
+
+ if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl; // rename the sample
+ return 1;
+}
+
+/*
+ The logic of this function is a bit complicated because we want to work
+ also with broken bams containing read groups that are not listed in the
+ header. The desired behavior is as follows:
+ - when -G is given, read groups which are not listed in the header must
+ be given explicitly using the "?" symbol in -G.
+ Otherwise:
+ - if the bam has no header, all reads in the file are assigned to a
+ single sample named after the file
+ - if there is at least one sample defined in the header, reads with no
+ read group id or with a read group id not listed in the header are
+ assigned to the first sample encountered in the header
+*/
+int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
+{
+ bsmpl->nfiles++;
+ bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
+ file_t *file = &bsmpl->files[bsmpl->nfiles-1];
+ memset(file,0,sizeof(file_t));
+ file->fname = strdup(fname);
+ file->default_idx = -1;
+
+ if ( bsmpl->ignore_rg || !bam_hdr )
+ {
+ // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
+ bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
+ return bsmpl->nfiles-1;
+ }
+
+ void *bam_smpls = khash_str2int_init();
+ int first_smpl = -1, nskipped = 0;
+ const char *p = bam_hdr, *q, *r;
+ while ((q = strstr(p, "@RG")) != 0)
+ {
+ p = q + 3;
+ r = q = 0;
+ if ((q = strstr(p, "\tID:")) != 0) q += 4;
+ if ((r = strstr(p, "\tSM:")) != 0) r += 4;
+ if (r && q)
+ {
+ char *u, *v;
+ int ioq, ior;
+ for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
+ for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
+ ioq = *u; ior = *v; *u = *v = '\0';
+
+ // q now points to a null terminated read group id
+ // r points to a null terminated sample name
+ if ( !strcmp("*",q) || !strcmp("?",q) )
+ error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);
+
+ int accept_rg = 1;
+ if ( bsmpl->sample_list )
+ {
+ // restrict samples based on the -s/-S options
+ char *name = khash_str2str_get(bsmpl->sample_list,r);
+ if ( bsmpl->sample_logic==0 )
+ accept_rg = name ? 0 : 1;
+ else if ( !name )
+ accept_rg = 0;
+ else
+ r = name;
+ }
+ if ( accept_rg && bsmpl->rg_list )
+ {
+ // restrict readgroups based on the -G option, possibly renaming the sample
+ accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
+ }
+ if ( accept_rg )
+ bsmpl_add_readgroup(bsmpl,file,q,r);
+ else
+ {
+ bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
+ nskipped++;
+ }
+
+ if ( first_smpl<0 )
+ khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
+ if ( !khash_str2int_has_key(bam_smpls,r) )
+ khash_str2int_inc(bam_smpls,strdup(r));
+
+ *u = ioq; *v = ior;
+ }
+ else
+ break;
+ p = q > r ? q : r;
+ }
+ int nsmpls = khash_str2int_size(bam_smpls);
+ khash_str2int_destroy_free(bam_smpls);
+
+ const char *smpl_name = NULL;
+ int accept_null_rg = 1;
+ if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
+ if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;
+
+ if ( !accept_null_rg && first_smpl==-1 )
+ {
+ // no suitable read group is available in this bam: ignore the whole file.
+ free(file->fname);
+ bsmpl->nfiles--;
+ return -1;
+ }
+ if ( !accept_null_rg ) return bsmpl->nfiles-1;
+ if ( nsmpls==1 && !nskipped )
+ {
+ file->default_idx = first_smpl;
+ return bsmpl->nfiles-1;
+ }
+ if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];
+
+ bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
+ return bsmpl->nfiles-1;
+}
+
+const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl)
+{
+ *nsmpl = bsmpl->nsmpl;
+ return (const char**)bsmpl->smpl;
+}
+
+int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec)
+{
+ file_t *file = &bsmpl->files[bam_id];
+ if ( file->default_idx >= 0 ) return file->default_idx;
+
+ char *aux_rg = (char*) bam_aux_get(bam_rec, "RG");
+ aux_rg = aux_rg ? aux_rg+1 : "?";
+
+ int rg_id;
+ if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id;
+ if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id;
+ return -1;
+}
+
+int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->sample_logic = 1;
+ else list++;
+
+ int i, nsamples = 0;
+ char **samples = hts_readlist(list, is_file, &nsamples);
+ if ( !nsamples ) return 0;
+
+ kstring_t ori = {0,0,0};
+ kstring_t ren = {0,0,0};
+
+ bsmpl->sample_list = khash_str2str_init();
+ for (i=0; i<nsamples; i++)
+ {
+ char *ptr = samples[i];
+ ori.l = ren.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ori);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ren);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s));
+ free(samples[i]);
+ }
+ free(samples);
+ free(ori.s);
+ free(ren.s);
+ return nsamples;
+}
+
+int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->rg_logic = 1;
+ else list++;
+
+ int i, nrows = 0;
+ char **rows = hts_readlist(list, is_file, &nrows);
+ if ( !nrows ) return 0;
+
+ kstring_t fld1 = {0,0,0};
+ kstring_t fld2 = {0,0,0};
+ kstring_t fld3 = {0,0,0};
+
+ bsmpl->rg_list = khash_str2str_init();
+ for (i=0; i<nrows; i++)
+ {
+ char *ptr = rows[i];
+ fld1.l = fld2.l = fld3.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld1);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld2);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld3);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( fld3.l )
+ {
+ // ID FILE SAMPLE
+ kputc('\t',&fld1);
+ kputs(fld2.s,&fld1);
+ fld2.l = 0;
+ kputs(fld3.s,&fld2);
+ }
+ // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
+ char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
+ if ( !value )
+ khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
+ else if ( strcmp(value,fld2.l?fld2.s:"\t") )
+ error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
+ free(rows[i]);
+ }
+ free(rows);
+ free(fld1.s);
+ free(fld2.s);
+ free(fld3.s);
+ return nrows;
+}
+
+
--- /dev/null
+#include "pysam.h"
+
+/* bam_sample.c -- group data by sample.
+
+ Copyright (C) 2010, 2011 Broad Institute.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <khash_str2str.h>
+#include "bam_sample.h"
+#include "bcftools.h"
+
+
+typedef struct
+{
+ char *fname;
+ void *rg2idx; // hash: read group name to BCF output sample index. Maintained by bsmpl_add_readgroup
+ int default_idx; // default BCF output sample index, set only when all readgroups are treated as one sample
+}
+file_t;
+
+struct _bam_smpl_t
+{
+ kstring_t tmp;
+ file_t *files;
+ int ignore_rg, nsmpl, nfiles;
+ char **smpl; // list of BCF output sample names. Maintained by bsmpl_add_readgroup
+ void *sample_list; // hash: BAM input sample name to BCF output sample name. This is the -s/-S list
+ int sample_logic; // the -s/-S logic, 1: include, 0: exclude
+ void *rg_list; // hash: BAM/rg_id to sample name or */rg_id for global ids. This is the -G list
+ int rg_logic; // the -G logic, 1: include, 0: exclude
+ void *name2idx; // hash: BCF output sample name to BCF output sample index. Maintained by bsmpl_add_readgroup
+};
+
+bam_smpl_t *bam_smpl_init(void)
+{
+ bam_smpl_t *bsmpl;
+ bsmpl = (bam_smpl_t*) calloc(1, sizeof(bam_smpl_t));
+ bsmpl->name2idx = khash_str2int_init();
+ return bsmpl;
+}
+
+void bam_smpl_destroy(bam_smpl_t *bsmpl)
+{
+ if ( !bsmpl ) return;
+ if ( bsmpl->name2idx ) khash_str2int_destroy_free(bsmpl->name2idx);
+ if ( bsmpl->sample_list ) khash_str2str_destroy_free_all(bsmpl->sample_list);
+ if ( bsmpl->rg_list ) khash_str2str_destroy_free_all(bsmpl->rg_list);
+ int i;
+ for (i=0; i<bsmpl->nfiles; i++)
+ {
+ file_t *file = &bsmpl->files[i];
+ if ( file->rg2idx ) khash_str2int_destroy_free(file->rg2idx);
+ free(file->fname);
+ }
+ free(bsmpl->smpl);
+ free(bsmpl->files);
+ free(bsmpl->tmp.s);
+ free(bsmpl);
+}
+
+void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl)
+{
+ bsmpl->ignore_rg = 1;
+}
+
+static void bsmpl_add_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char *smpl_name)
+{
+ int ismpl = -1;
+ if ( smpl_name )
+ {
+ if ( khash_str2int_get(bsmpl->name2idx,smpl_name,&ismpl) < 0 )
+ {
+ // new sample
+ bsmpl->nsmpl++;
+ bsmpl->smpl = (char**) realloc(bsmpl->smpl,sizeof(char*)*bsmpl->nsmpl);
+ bsmpl->smpl[bsmpl->nsmpl-1] = strdup(smpl_name);
+ ismpl = khash_str2int_inc(bsmpl->name2idx,bsmpl->smpl[bsmpl->nsmpl-1]);
+ }
+ }
+ if ( !strcmp("*",rg_id) )
+ {
+ // all read groups in the bam treated as the same sample
+ file->default_idx = ismpl;
+ return;
+ }
+ if ( !file->rg2idx ) file->rg2idx = khash_str2int_init();
+ if ( khash_str2int_has_key(file->rg2idx,rg_id) ) return; // duplicate @RG:ID
+ khash_str2int_set(file->rg2idx, strdup(rg_id), ismpl);
+}
+static int bsmpl_keep_readgroup(bam_smpl_t *bsmpl, file_t *file, const char *rg_id, const char **smpl_name)
+{
+ char *rg_smpl = khash_str2str_get(bsmpl->rg_list,rg_id); // unique read group present in one bam only
+ if ( !rg_smpl )
+ {
+ // read group specific to this bam
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"%s\t%s",rg_id,file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl )
+ {
+ // any read group in this file?
+ bsmpl->tmp.l = 0;
+ ksprintf(&bsmpl->tmp,"*\t%s",file->fname);
+ rg_smpl = khash_str2str_get(bsmpl->rg_list,bsmpl->tmp.s);
+ }
+ if ( !rg_smpl && bsmpl->rg_logic ) return 0;
+ if ( rg_smpl && !bsmpl->rg_logic ) return 0;
+
+ if ( rg_smpl && rg_smpl[0]!='\t' ) *smpl_name = rg_smpl; // rename the sample
+ return 1;
+}
+
+/*
+ The logic of this function is a bit complicated because we want to work
+ also with broken bams containing read groups that are not listed in the
+ header. The desired behavior is as follows:
+ - when -G is given, read groups which are not listed in the header must
+ be given explicitly using the "?" symbol in -G.
+ Otherwise:
+ - if the bam has no header, all reads in the file are assigned to a
+ single sample named after the file
+ - if there is at least one sample defined in the header, reads with no
+ read group id or with a read group id not listed in the header are
+ assigned to the first sample encountered in the header
+*/
+int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname)
+{
+ bsmpl->nfiles++;
+ bsmpl->files = (file_t*) realloc(bsmpl->files,bsmpl->nfiles*sizeof(file_t));
+ file_t *file = &bsmpl->files[bsmpl->nfiles-1];
+ memset(file,0,sizeof(file_t));
+ file->fname = strdup(fname);
+ file->default_idx = -1;
+
+ if ( bsmpl->ignore_rg || !bam_hdr )
+ {
+ // The option --ignore-RG is set or there is no BAM header: use the file name as the sample name
+ bsmpl_add_readgroup(bsmpl,file,"*",file->fname);
+ return bsmpl->nfiles-1;
+ }
+
+ void *bam_smpls = khash_str2int_init();
+ int first_smpl = -1, nskipped = 0;
+ const char *p = bam_hdr, *q, *r;
+ while ((q = strstr(p, "@RG")) != 0)
+ {
+ p = q + 3;
+ r = q = 0;
+ if ((q = strstr(p, "\tID:")) != 0) q += 4;
+ if ((r = strstr(p, "\tSM:")) != 0) r += 4;
+ if (r && q)
+ {
+ char *u, *v;
+ int ioq, ior;
+ for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
+ for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
+ ioq = *u; ior = *v; *u = *v = '\0';
+
+ // q now points to a null terminated read group id
+ // r points to a null terminated sample name
+ if ( !strcmp("*",q) || !strcmp("?",q) )
+ error("Error: the read group IDs \"*\" and \"?\" have a special meaning in the mpileup code. Please fix the code or the bam: %s\n", fname);
+
+ int accept_rg = 1;
+ if ( bsmpl->sample_list )
+ {
+ // restrict samples based on the -s/-S options
+ char *name = khash_str2str_get(bsmpl->sample_list,r);
+ if ( bsmpl->sample_logic==0 )
+ accept_rg = name ? 0 : 1;
+ else if ( !name )
+ accept_rg = 0;
+ else
+ r = name;
+ }
+ if ( accept_rg && bsmpl->rg_list )
+ {
+ // restrict readgroups based on the -G option, possibly renaming the sample
+ accept_rg = bsmpl_keep_readgroup(bsmpl,file,q,&r);
+ }
+ if ( accept_rg )
+ bsmpl_add_readgroup(bsmpl,file,q,r);
+ else
+ {
+ bsmpl_add_readgroup(bsmpl,file,q,NULL); // ignore this RG but note that it was seen in the header
+ nskipped++;
+ }
+
+ if ( first_smpl<0 )
+ khash_str2int_get(bsmpl->name2idx,r,&first_smpl);
+ if ( !khash_str2int_has_key(bam_smpls,r) )
+ khash_str2int_inc(bam_smpls,strdup(r));
+
+ *u = ioq; *v = ior;
+ }
+ else
+ break;
+ p = q > r ? q : r;
+ }
+ int nsmpls = khash_str2int_size(bam_smpls);
+ khash_str2int_destroy_free(bam_smpls);
+
+ const char *smpl_name = NULL;
+ int accept_null_rg = 1;
+ if ( bsmpl->rg_list && !bsmpl_keep_readgroup(bsmpl,file,"?",&smpl_name) ) accept_null_rg = 0;
+ if ( bsmpl->sample_list && first_smpl==-1 ) accept_null_rg = 0;
+
+ if ( !accept_null_rg && first_smpl==-1 )
+ {
+ // no suitable read group is available in this bam: ignore the whole file.
+ free(file->fname);
+ bsmpl->nfiles--;
+ return -1;
+ }
+ if ( !accept_null_rg ) return bsmpl->nfiles-1;
+ if ( nsmpls==1 && !nskipped )
+ {
+ file->default_idx = first_smpl;
+ return bsmpl->nfiles-1;
+ }
+ if ( !smpl_name ) smpl_name = first_smpl==-1 ? file->fname : bsmpl->smpl[first_smpl];
+
+ bsmpl_add_readgroup(bsmpl,file,"?",smpl_name);
+ return bsmpl->nfiles-1;
+}
+
+const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl)
+{
+ *nsmpl = bsmpl->nsmpl;
+ return (const char**)bsmpl->smpl;
+}
+
+int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec)
+{
+ file_t *file = &bsmpl->files[bam_id];
+ if ( file->default_idx >= 0 ) return file->default_idx;
+
+ char *aux_rg = (char*) bam_aux_get(bam_rec, "RG");
+ aux_rg = aux_rg ? aux_rg+1 : "?";
+
+ int rg_id;
+ if ( khash_str2int_get(file->rg2idx, aux_rg, &rg_id)==0 ) return rg_id;
+ if ( khash_str2int_get(file->rg2idx, "?", &rg_id)==0 ) return rg_id;
+ return -1;
+}
+
+int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->sample_logic = 1;
+ else list++;
+
+ int i, nsamples = 0;
+ char **samples = hts_readlist(list, is_file, &nsamples);
+ if ( !nsamples ) return 0;
+
+ kstring_t ori = {0,0,0};
+ kstring_t ren = {0,0,0};
+
+ bsmpl->sample_list = khash_str2str_init();
+ for (i=0; i<nsamples; i++)
+ {
+ char *ptr = samples[i];
+ ori.l = ren.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ori);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &ren);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s));
+ free(samples[i]);
+ }
+ free(samples);
+ free(ori.s);
+ free(ren.s);
+ return nsamples;
+}
+
+int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
+{
+ if ( list[0]!='^' ) bsmpl->rg_logic = 1;
+ else list++;
+
+ int i, nrows = 0;
+ char **rows = hts_readlist(list, is_file, &nrows);
+ if ( !nrows ) return 0;
+
+ kstring_t fld1 = {0,0,0};
+ kstring_t fld2 = {0,0,0};
+ kstring_t fld3 = {0,0,0};
+
+ bsmpl->rg_list = khash_str2str_init();
+ for (i=0; i<nrows; i++)
+ {
+ char *ptr = rows[i];
+ fld1.l = fld2.l = fld3.l = 0;
+ int escaped = 0;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld1);
+ escaped = 0;
+ ptr++;
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld2);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( *ptr )
+ {
+ while ( *ptr && isspace(*ptr) ) ptr++;
+ while ( *ptr )
+ {
+ if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
+ if ( isspace(*ptr) && !escaped ) break;
+ kputc(*ptr, &fld3);
+ escaped = 0;
+ ptr++;
+ }
+ }
+ if ( fld3.l )
+ {
+ // ID FILE SAMPLE
+ kputc('\t',&fld1);
+ kputs(fld2.s,&fld1);
+ fld2.l = 0;
+ kputs(fld3.s,&fld2);
+ }
+ // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
+ char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
+ if ( !value )
+ khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
+ else if ( strcmp(value,fld2.l?fld2.s:"\t") )
+ error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
+ free(rows[i]);
+ }
+ free(rows);
+ free(fld1.s);
+ free(fld2.s);
+ free(fld3.s);
+ return nrows;
+}
+
+
--- /dev/null
+/* bam_sample.h -- group data by sample.
+
+ Copyright (C) 2010 Broad Institute.
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Heng Li <lh3@sanger.ac.uk>, Petr Danecek <pd3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#ifndef BAM_SAMPLE_H
+#define BAM_SAMPLE_H
+
+#include <htslib/sam.h>
+
+typedef struct _bam_smpl_t bam_smpl_t;
+
+bam_smpl_t *bam_smpl_init(void);
+
+int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file);
+int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file);
+void bam_smpl_ignore_readgroups(bam_smpl_t* bsmpl);
+
+// The above should be called only before bams are added. Returns the BAM id
+// to be passed to bam_smpl_get_sample_id() later. It is safe to assume
+// sequential numbering, starting from 0.
+//
+int bam_smpl_add_bam(bam_smpl_t *bsmpl, char *bam_hdr, const char *fname);
+
+const char **bam_smpl_get_samples(bam_smpl_t *bsmpl, int *nsmpl);
+int bam_smpl_get_sample_id(bam_smpl_t *bsmpl, int bam_id, bam1_t *bam_rec);
+
+void bam_smpl_destroy(bam_smpl_t *bsmpl);
+
+#endif
#include <htslib/vcf.h>
#include <math.h>
+#define FT_TAB_TEXT 0 // custom tab-delimited text file
#define FT_GZ 1
#define FT_VCF 2
#define FT_VCF_GZ (FT_GZ|FT_VCF)
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include "bcftools.h"
+#include "bin.h"
+
+struct _bin_t
+{
+ float *bins;
+ int nbins;
+};
+
+bin_t *bin_init(const char *list_def, float min, float max)
+{
+ bin_t *bin = (bin_t*) calloc(1,sizeof(bin_t));
+
+ // a comma indicates a list, otherwise a file
+ int is_file = strchr(list_def,',') ? 0 : 1;
+ int i, nlist;
+ char **list = hts_readlist(list_def, is_file, &nlist);
+ bin->nbins = nlist;
+ bin->bins = (float*) malloc(sizeof(float)*nlist);
+ for (i=0; i<nlist; i++)
+ {
+ char *tmp;
+ bin->bins[i] = strtod(list[i],&tmp);
+ if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
+ if ( min!=max && (bin->bins[i]<min || bin->bins[i]>max) )
+ error("Expected values from the interval [%f,%f], found %s\n", list[i]);
+ free(list[i]);
+ }
+ free(list);
+
+ if ( min!=max )
+ {
+ // make sure we've got both boundaries: min,max.
+ assert( nlist>1 );
+ float max_err = (bin->bins[1] - bin->bins[0])*1e-6;
+ if ( fabs(bin->bins[0] - min) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ memmove(bin->bins+1, bin->bins, sizeof(float)*(bin->nbins-1));
+ bin->bins[0] = min;
+ }
+ if ( fabs(bin->bins[bin->nbins-1] - max) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ bin->bins[bin->nbins-1] = max;
+ }
+ }
+ return bin;
+}
+
+void bin_destroy(bin_t *bin)
+{
+ free(bin->bins);
+ free(bin);
+}
+
+int bin_get_size(bin_t *bin) { return bin->nbins; }
+
+float bin_get_value(bin_t *bin, int idx) { return bin->bins[idx]; }
+
+int bin_get_idx(bin_t *bin, float value)
+{
+ if ( bin->bins[bin->nbins-1] < value ) return bin->nbins-1;
+
+ // Binary search in half-closed,half-open intervals [)
+ int imin = 0, imax = bin->nbins - 2;
+ while ( imin<imax )
+ {
+ int i = (imin+imax)/2;
+ if ( value < bin->bins[i] ) imax = i - 1;
+ else if ( value > bin->bins[i] ) imin = i + 1;
+ else return i;
+ }
+ if ( bin->bins[imax] <= value ) return imax;
+ return imin - 1;
+}
+
--- /dev/null
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <stdio.h>
+#include "bcftools.h"
+#include "bin.h"
+
+struct _bin_t
+{
+ float *bins;
+ int nbins;
+};
+
+bin_t *bin_init(const char *list_def, float min, float max)
+{
+ bin_t *bin = (bin_t*) calloc(1,sizeof(bin_t));
+
+ // a comma indicates a list, otherwise a file
+ int is_file = strchr(list_def,',') ? 0 : 1;
+ int i, nlist;
+ char **list = hts_readlist(list_def, is_file, &nlist);
+ bin->nbins = nlist;
+ bin->bins = (float*) malloc(sizeof(float)*nlist);
+ for (i=0; i<nlist; i++)
+ {
+ char *tmp;
+ bin->bins[i] = strtod(list[i],&tmp);
+ if ( !tmp ) error("Could not parse %s: %s\n", list_def, list[i]);
+ if ( min!=max && (bin->bins[i]<min || bin->bins[i]>max) )
+ error("Expected values from the interval [%f,%f], found %s\n", list[i]);
+ free(list[i]);
+ }
+ free(list);
+
+ if ( min!=max )
+ {
+ // make sure we've got both boundaries: min,max.
+ assert( nlist>1 );
+ float max_err = (bin->bins[1] - bin->bins[0])*1e-6;
+ if ( fabs(bin->bins[0] - min) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ memmove(bin->bins+1, bin->bins, sizeof(float)*(bin->nbins-1));
+ bin->bins[0] = min;
+ }
+ if ( fabs(bin->bins[bin->nbins-1] - max) > max_err )
+ {
+ bin->bins = (float*) realloc(bin->bins, (++bin->nbins)*sizeof(float));
+ bin->bins[bin->nbins-1] = max;
+ }
+ }
+ return bin;
+}
+
+void bin_destroy(bin_t *bin)
+{
+ free(bin->bins);
+ free(bin);
+}
+
+int bin_get_size(bin_t *bin) { return bin->nbins; }
+
+float bin_get_value(bin_t *bin, int idx) { return bin->bins[idx]; }
+
+int bin_get_idx(bin_t *bin, float value)
+{
+ if ( bin->bins[bin->nbins-1] < value ) return bin->nbins-1;
+
+ // Binary search in half-closed,half-open intervals [)
+ int imin = 0, imax = bin->nbins - 2;
+ while ( imin<imax )
+ {
+ int i = (imin+imax)/2;
+ if ( value < bin->bins[i] ) imax = i - 1;
+ else if ( value > bin->bins[i] ) imin = i + 1;
+ else return i;
+ }
+ if ( bin->bins[imax] <= value ) return imax;
+ return imin - 1;
+}
+
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+/*
+ Simple binning of float values into predefined bins
+*/
+
+#ifndef __BIN_H__
+#define __BIN_H__
+
+#include <stdio.h>
+
+typedef struct _bin_t bin_t;
+
+/*
+ * bin_init() - init bins
+ * @list: list of half-open intervals [). If the list does not contain commas,
+ * it is interpreted as a file name.
+ * @min,max: extreme values. This is for user convenience so that well-known
+ * extremes can be left out from the list. Ignored if min=max
+ */
+bin_t *bin_init(const char *list, float min, float max);
+void bin_destroy(bin_t *bin);
+
+/*
+ * bin_get_size() - number of boundaries, subtract 1 to get the number of bins
+ */
+int bin_get_size(bin_t *bin);
+
+/*
+ bin_get_idx() - find the bin index which corresponds to the value (binary search)
+ Returns the bin index 0 <= idx <= size-2 or -1,size-1 for out of range values.
+ */
+int bin_get_idx(bin_t *bin, float value);
+
+/*
+ bin_get_value() - get the i-th boundary value, i=0,..,size-1
+ */
+float bin_get_value(bin_t *bin, int ith);
+
+#endif
+
double trio_Pm_SNPs, trio_Pm_del, trio_Pm_ins; // P(mendelian) for trio calling, see mcall_call_trio_genotypes()
int32_t *ugts, *cgts; // unconstraind and constrained GTs
uint32_t output_tags;
+ char *prior_AN, *prior_AC; // reference panel AF tags (AF=AC/AN)
// ccall only
double indel_frac, min_perm_p, min_lrt;
void error(const char *format, ...);
/*
- * *call() - return negative value on error or the number of non-reference
+ * call() - return -1 value on critical error; -2 to skip the site; or the number of non-reference
* alleles on success.
*/
int mcall(call_t *call, bcf1_t *rec); // multiallic and rare-variant calling model
bcf_update_info_string(call->hdr, rec, "CGT", tmp);
}
}
- if (pr == 0) return 1;
-
is_var = (pr->p_ref < call->pref);
r = is_var? pr->p_ref : pr->p_var;
// Remove unused alleles
int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
- if ( call->flag & CALL_KEEPALT && call->unseen>0 )
- {
- assert( call->unseen==nals-1 );
- nals--;
- }
+ if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--;
if ( nals<rec->n_allele )
{
int i;
for (i=0; i<rec->n_sample; i++)
{
- int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2;
+ int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2;
int gt = x&3;
if ( !call->ploidy || call->ploidy[i]==2 )
{
bcf_update_info_string(call->hdr, rec, "CGT", tmp);
}
}
- if (pr == 0) return 1;
-
is_var = (pr->p_ref < call->pref);
r = is_var? pr->p_ref : pr->p_var;
// Remove unused alleles
int nals_ori = rec->n_allele, nals = !is_var && !(call->flag & CALL_KEEPALT) ? 1 : pr->rank0 < 2? 2 : pr->rank0+1;
- if ( call->flag & CALL_KEEPALT && call->unseen>0 )
- {
- assert( call->unseen==nals-1 );
- nals--;
- }
+ if ( call->flag & CALL_KEEPALT && call->unseen==nals-1 ) nals--;
if ( nals<rec->n_allele )
{
int i;
for (i=0; i<rec->n_sample; i++)
{
- int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i) : 2;
+ int x = ( is_var || call->output_tags & CALL_FMT_GQ ) ? bcf_p1_call_gt(p1, pr->f_exp, i, is_var) : 2;
int gt = x&3;
if ( !call->ploidy || call->ploidy[i]==2 )
{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
#include <htslib/kstring.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
-#include <htslib/regidx.h>
+#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
int nvcf_buf, rid;
regidx_t *mask;
+ regitr_t *itr;
int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
{
args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
if ( !args->mask ) error("Failed to initialize mask regions\n");
+ args->itr = regitr_init(args->mask);
}
// In case we want to store the chains
if ( args->chain_fname )
free(args->vcf_buf);
free(args->fa_buf.s);
if ( args->mask ) regidx_destroy(args->mask);
+ if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
rec->d.allele[1][0] = gt2iupac(ial,jal);
}
+ int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
- if ( idx<0 || idx>=args->fa_buf.l )
+ if ( idx<0 )
+ {
+ fprintf(stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ return;
+ }
+ if ( rec->rlen > args->fa_buf.l - idx )
+ {
+ rec->rlen = args->fa_buf.l - idx;
+ alen = strlen(rec->d.allele[ialt]);
+ if ( alen > rec->rlen )
+ {
+ rec->d.allele[ialt][rec->rlen] = 0;
+ fprintf(stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ }
+ }
+ if ( idx>=args->fa_buf.l )
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
- int len_diff = 0, alen = 0;
if ( rec->d.allele[ialt][0]=='<' )
{
if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
- regitr_t itr;
- if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return;
+ if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
- while ( REGITR_OVERLAP(itr,start,end) )
+ while ( regitr_overlap(args->itr) )
{
- idx_start = REGITR_START(itr) - start;
- idx_end = REGITR_END(itr) - start;
+ idx_start = args->itr->beg - start;
+ idx_end = args->itr->end - start;
if ( idx_start < 0 ) idx_start = 0;
if ( idx_end >= len ) idx_end = len - 1;
for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
- itr.i++;
}
}
{
if ( str.s[0]=='>' )
{
- // new sequence encountered, apply all chached variants
+ // new sequence encountered, apply all cached variants
while ( args->vcf_rbuf.n )
{
if (args->chain) {
}
if ( !rec_ptr ) flush_fa_buffer(args, 60);
}
- if (args->chain) {
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
+ {
+ bcf1_t *rec = *rec_ptr;
+ if ( rec->rid!=args->rid ) break;
+ if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break;
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
+ apply_variant(args, rec);
+ }
+ if (args->chain)
+ {
print_chain(args);
destroy_chain(args);
}
static void usage(args_t *args)
{
fprintf(stderr, "\n");
- fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference\n");
- fprintf(stderr, " fasta file.\n");
+ fprintf(stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
+ fprintf(stderr, " file. By default, the program will apply all ALT variants. Using the\n");
+ fprintf(stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
+ fprintf(stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
+ fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <getopt.h>
#include <unistd.h>
#include <htslib/kstring.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/kseq.h>
-#include <htslib/regidx.h>
+#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
int nvcf_buf, rid;
regidx_t *mask;
+ regitr_t *itr;
int chain_id; // chain_id, to provide a unique ID to each chain in the chain output
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
{
args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
if ( !args->mask ) error("Failed to initialize mask regions\n");
+ args->itr = regitr_init(args->mask);
}
// In case we want to store the chains
if ( args->chain_fname )
free(args->vcf_buf);
free(args->fa_buf.s);
if ( args->mask ) regidx_destroy(args->mask);
+ if ( args->itr ) regitr_destroy(args->itr);
if ( args->chain_fname )
if ( fclose(args->fp_chain) ) error("Close failed: %s\n", args->chain_fname);
if ( fclose(args->fp_out) ) error("Close failed: %s\n", args->output_fname);
rec->d.allele[1][0] = gt2iupac(ial,jal);
}
+ int len_diff = 0, alen = 0;
int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
- if ( idx<0 || idx>=args->fa_buf.l )
+ if ( idx<0 )
+ {
+ fprintf(pysam_stderr,"Warning: ignoring overlapping variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ return;
+ }
+ if ( rec->rlen > args->fa_buf.l - idx )
+ {
+ rec->rlen = args->fa_buf.l - idx;
+ alen = strlen(rec->d.allele[ialt]);
+ if ( alen > rec->rlen )
+ {
+ rec->d.allele[ialt][rec->rlen] = 0;
+ fprintf(pysam_stderr,"Warning: trimming variant starting at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
+ }
+ }
+ if ( idx>=args->fa_buf.l )
error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);
// sanity check the reference base
- int len_diff = 0, alen = 0;
if ( rec->d.allele[ialt][0]=='<' )
{
if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
int start = args->fa_src_pos - len;
int end = args->fa_src_pos;
- regitr_t itr;
- if ( !regidx_overlap(args->mask, chr,start,end, &itr) ) return;
+ if ( !regidx_overlap(args->mask, chr,start,end, args->itr) ) return;
int idx_start, idx_end, i;
- while ( REGITR_OVERLAP(itr,start,end) )
+ while ( regitr_overlap(args->itr) )
{
- idx_start = REGITR_START(itr) - start;
- idx_end = REGITR_END(itr) - start;
+ idx_start = args->itr->beg - start;
+ idx_end = args->itr->end - start;
if ( idx_start < 0 ) idx_start = 0;
if ( idx_end >= len ) idx_end = len - 1;
for (i=idx_start; i<=idx_end; i++) seq[i] = 'N';
- itr.i++;
}
}
{
if ( str.s[0]=='>' )
{
- // new sequence encountered, apply all chached variants
+ // new sequence encountered, apply all cached variants
while ( args->vcf_rbuf.n )
{
if (args->chain) {
}
if ( !rec_ptr ) flush_fa_buffer(args, 60);
}
- if (args->chain) {
+ bcf1_t **rec_ptr = NULL;
+ while ( args->rid>=0 && (rec_ptr = next_vcf_line(args)) )
+ {
+ bcf1_t *rec = *rec_ptr;
+ if ( rec->rid!=args->rid ) break;
+ if ( args->fa_end_pos && rec->pos > args->fa_end_pos ) break;
+ if ( args->fa_ori_pos + args->fa_buf.l - args->fa_mod_off <= rec->pos ) break;
+ apply_variant(args, rec);
+ }
+ if (args->chain)
+ {
print_chain(args);
destroy_chain(args);
}
static void usage(args_t *args)
{
fprintf(pysam_stderr, "\n");
- fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference\n");
- fprintf(pysam_stderr, " fasta file.\n");
+ fprintf(pysam_stderr, "About: Create consensus sequence by applying VCF variants to a reference fasta\n");
+ fprintf(pysam_stderr, " file. By default, the program will apply all ALT variants. Using the\n");
+ fprintf(pysam_stderr, " --sample (and, optionally, --haplotype) option will apply genotype\n");
+ fprintf(pysam_stderr, " (or haplotype) calls from FORMAT/GT. The program ignores allelic depth\n");
+ fprintf(pysam_stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(pysam_stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(pysam_stderr, "Options:\n");
fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#define T_IUPAC_GT 23
#define T_GT_TO_HAP 24 // not publicly advertised
#define T_GT_TO_HAP2 25 // not publicly advertised
+#define T_TBCSQ 26
+#define T_END 27
+#define T_POS0 28
+#define T_END0 29
typedef struct _fmt_t
{
int type, id, is_gt_field, ready, subscript;
char *key;
bcf_fmt_t *fmt;
+ void *usr; // user data (optional)
void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *);
+ void (*destroy)(void*); // clean user data (optional)
}
fmt_t;
int allow_undef_tags;
};
+typedef struct
+{
+ kstring_t hap1,hap2;
+ char **str;
+ int n, m;
+}
+bcsq_t;
static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
+static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); }
+static void process_end(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen, str); }
+static void process_end0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen-1, str); }
static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); }
static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); }
static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( bcf_float_is_missing(line->qual) ) kputc('.', str);
- else ksprintf(str, "%g", line->qual);
+ else kputd(line->qual, str);
}
static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
- case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break;
case BCF_BT_CHAR: kputc(info->v1.i, str); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break;
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,fmt->id) ) fmt->id = -1;
fmt->fmt = NULL;
if ( fmt->id >= 0 )
{
if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
kputc('.', str);
else
- ksprintf(str, "%g", ptr[fmt->subscript]);
+ kputd(ptr[fmt->subscript], str);
}
else if ( fmt->fmt->type != BCF_BT_CHAR )
{
}
if (l == 0) kputc('.', str);
}
+static void destroy_tbcsq(void *usr)
+{
+ if ( !usr ) return;
+ bcsq_t *csq = (bcsq_t*) usr;
+ free(csq->hap1.s);
+ free(csq->hap2.s);
+ if ( csq->n )
+ free(csq->str[0]);
+ free(csq->str);
+ free(csq);
+}
+static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ {
+ init_format(convert, line, fmt);
+
+ bcsq_t *csq;
+ if ( fmt->usr )
+ {
+ csq = (bcsq_t*) fmt->usr;
+ if ( csq->n )
+ free(csq->str[0]);
+ csq->n = 0;
+ }
+ else
+ csq = (bcsq_t*) calloc(1,sizeof(bcsq_t));
+ fmt->usr = csq;
+
+ int i=0, len = 0;
+ char *tmp = NULL;
+ if ( bcf_get_info_string(convert->header,line,fmt->key,&tmp,&len)<0 )
+ {
+ csq->n = 0;
+ return;
+ }
+ do
+ {
+ csq->n++;
+ hts_expand(char*, csq->n, csq->m, csq->str);
+ csq->str[ csq->n-1 ] = tmp + i;
+ while ( i<len && tmp[i]!=',' ) i++;
+ if ( i<len && tmp[i]==',' ) tmp[i++] = 0;
+ }
+ while ( i<len );
+ }
+
+ bcsq_t *csq = (bcsq_t*)fmt->usr;
+
+ if ( fmt->fmt==NULL || !csq->n ) return;
+
+ csq->hap1.l = 0;
+ csq->hap2.l = 0;
+
+ int mask = fmt->subscript==0 ? 3 : 1; // merge both haplotypes if subscript==0
+
+ #define BRANCH(type_t, nbits) { \
+ type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \
+ int i,j; \
+ if ( fmt->subscript<=0 || fmt->subscript==1 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=0; i<nbits; i+=2) \
+ if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+ } \
+ } \
+ if ( fmt->subscript<0 || fmt->subscript==2 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=1; i<nbits; i+=2) \
+ if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+ } \
+ } \
+ }
+ switch (fmt->fmt->type)
+ {
+ case BCF_BT_INT8: BRANCH(uint8_t, 8); break;
+ case BCF_BT_INT16: BRANCH(uint16_t,16); break;
+ case BCF_BT_INT32: BRANCH(uint32_t,32); break;
+ default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
+ }
+ #undef BRANCH
+
+ if ( !csq->hap1.l && !csq->hap2.l ) return;
+
+ if ( csq->hap1.l ) csq->hap1.s[--csq->hap1.l] = 0;
+ if ( csq->hap2.l ) csq->hap2.s[--csq->hap2.l] = 0;
+
+ if ( fmt->subscript<0 )
+ {
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ kputc_('\t', str);
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+ }
+ else if ( fmt->subscript<2 )
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ else
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+}
static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
init_format(convert, line, fmt);
if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; }
if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; }
if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; }
+ if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; }
}
static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
// the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a
// heterozygous genotype of unknown phase.
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- {
- // Throw an error or silently proceed?
- //
- // for (i=0; i<convert->nsamples; i++) kputs(" ...", str);
- // return;
-
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- }
-
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? -", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 -", str); // first ALT allele
- else
- kputs("0 -", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ if ( ptr[1]==bcf_int8_vector_end )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
// same as process_gt_to_hap but converts haploid genotypes into diploid
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 -> 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? ?", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 1", str); // first ALT allele
- else
- kputs("0 0", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 -> 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
fmt->key = key ? strdup(key) : NULL;
fmt->is_gt_field = is_gtf;
fmt->subscript = -1;
+ fmt->usr = NULL;
+ fmt->destroy = NULL;
// Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags.
if ( key )
{
if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
+ else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; }
+ else if ( !strcmp("END",key) ) { fmt->type = T_END; }
+ else if ( !strcmp("END0",key) ) { fmt->type = T_END0; }
else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break;
case T_CHROM: fmt->handler = &process_chrom; break;
case T_POS: fmt->handler = &process_pos; break;
+ case T_POS0: fmt->handler = &process_pos0; break;
+ case T_END: fmt->handler = &process_end; break;
+ case T_END0: fmt->handler = &process_end0; break;
case T_ID: fmt->handler = &process_id; break;
case T_REF: fmt->handler = &process_ref; break;
case T_ALT: fmt->handler = &process_alt; break;
case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break;
- case T_LINE: fmt->handler = &process_line; break;
+ case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
- if ( key )
+ if ( key && fmt->type==T_INFO )
{
- if ( fmt->type==T_INFO )
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,fmt->id) )
{
- fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
- if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key);
+ fmt->id = -1;
+ convert->undef_info_tag = strdup(key);
}
}
return fmt;
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "TBCSQ") )
+ {
+ fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ if ( fmt->subscript==-1 )
+ {
+ if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; }
+ }
+ else fmt->subscript++;
+ }
else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "INFO") )
{
{
if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf);
else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
else if ( !strcmp(str.s, "ALT") )
default: p = parse_sep(convert, p, is_gtf); break;
}
}
+ if ( is_gtf )
+ error("Could not parse the format string, missing the square bracket \"]\": %s\n", convert->format_str);
if ( nsamples )
{
{
int i;
for (i=0; i<convert->nfmt; i++)
+ {
+ if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr);
free(convert->fmt[i].key);
+ }
free(convert->fmt);
free(convert->undef_info_tag);
free(convert->dat);
int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
{
if ( !convert->allow_undef_tags && convert->undef_info_tag )
- error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag);
+ error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag);
int l_ori = str->l;
bcf_unpack(line, convert->max_unpack);
str->l = 0;
for (i=0; i<convert->nfmt; i++)
{
- // Genotype fields
+ // Genotype fields.
if ( convert->fmt[i].is_gt_field )
{
int j = i, js, k;
- while ( convert->fmt[j].is_gt_field )
+ while ( j<convert->nfmt && convert->fmt[j].is_gt_field )
{
convert->fmt[j].ready = 0;
j++;
}
for (js=0; js<convert->nsamples; js++)
{
+ // Here comes a hack designed for TBCSQ. When running on large files,
+ // such as 1000GP, there are too many empty fields in the output and
+ // it's very very slow. Therefore in case the handler does not add
+ // anything to the string, we trim all genotype fields enclosed in square
+ // brackets here. This may be changed in future, time will show...
+ size_t l_start = str->l;
+
int ks = convert->samples[js];
for (k=i; k<j; k++)
{
kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
}
else if ( convert->fmt[k].handler )
+ {
+ size_t l = str->l;
convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+ if ( l==str->l ) { str->l = l_start; break; } // only TBCSQ does this
+ }
}
}
i = j-1;
}
else if ( convert->fmt[i].handler )
convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str);
+
}
return str->l - l_ori;
}
/* convert.c -- functions for converting between VCF/BCF and related formats.
- Copyright (C) 2013-2014 Genome Research Ltd.
+ Copyright (C) 2013-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#define T_IUPAC_GT 23
#define T_GT_TO_HAP 24 // not publicly advertised
#define T_GT_TO_HAP2 25 // not publicly advertised
+#define T_TBCSQ 26
+#define T_END 27
+#define T_POS0 28
+#define T_END0 29
typedef struct _fmt_t
{
int type, id, is_gt_field, ready, subscript;
char *key;
bcf_fmt_t *fmt;
+ void *usr; // user data (optional)
void (*handler)(convert_t *, bcf1_t *, struct _fmt_t *, int, kstring_t *);
+ void (*destroy)(void*); // clean user data (optional)
}
fmt_t;
int allow_undef_tags;
};
+typedef struct
+{
+ kstring_t hap1,hap2;
+ char **str;
+ int n, m;
+}
+bcsq_t;
static void process_chrom(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(convert->header->id[BCF_DT_CTG][line->rid].key, str); }
static void process_pos(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+1, str); }
+static void process_pos0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos, str); }
+static void process_end(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen, str); }
+static void process_end0(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputw(line->pos+line->rlen-1, str); }
static void process_id(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.id, str); }
static void process_ref(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str) { kputs(line->d.allele[0], str); }
static void process_alt(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
static void process_qual(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
if ( bcf_float_is_missing(line->qual) ) kputc('.', str);
- else ksprintf(str, "%g", line->qual);
+ else kputd(line->qual, str);
}
static void process_filter(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
case BCF_BT_INT8: if ( info->v1.i==bcf_int8_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT16: if ( info->v1.i==bcf_int16_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
case BCF_BT_INT32: if ( info->v1.i==bcf_int32_missing ) kputc('.', str); else kputw(info->v1.i, str); break;
- case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else ksprintf(str, "%g", info->v1.f); break;
+ case BCF_BT_FLOAT: if ( bcf_float_is_missing(info->v1.f) ) kputc('.', str); else kputd(info->v1.f, str); break;
case BCF_BT_CHAR: kputc(info->v1.i, str); break;
default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
case BCF_BT_INT8: BRANCH(int8_t, val==bcf_int8_missing, val==bcf_int8_vector_end, kputw(val, str)); break;
case BCF_BT_INT16: BRANCH(int16_t, val==bcf_int16_missing, val==bcf_int16_vector_end, kputw(val, str)); break;
case BCF_BT_INT32: BRANCH(int32_t, val==bcf_int32_missing, val==bcf_int32_vector_end, kputw(val, str)); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), ksprintf(str, "%g", val)); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(val), bcf_float_is_vector_end(val), kputd(val, str)); break;
default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
static void init_format(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, fmt->key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,fmt->id) ) fmt->id = -1;
fmt->fmt = NULL;
if ( fmt->id >= 0 )
{
if ( bcf_float_is_missing(ptr[fmt->subscript]) || bcf_float_is_vector_end(ptr[fmt->subscript]) )
kputc('.', str);
else
- ksprintf(str, "%g", ptr[fmt->subscript]);
+ kputd(ptr[fmt->subscript], str);
}
else if ( fmt->fmt->type != BCF_BT_CHAR )
{
}
if (l == 0) kputc('.', str);
}
+static void destroy_tbcsq(void *usr)
+{
+ if ( !usr ) return;
+ bcsq_t *csq = (bcsq_t*) usr;
+ free(csq->hap1.s);
+ free(csq->hap2.s);
+ if ( csq->n )
+ free(csq->str[0]);
+ free(csq->str);
+ free(csq);
+}
+static void process_tbcsq(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
+{
+ if ( !fmt->ready )
+ {
+ init_format(convert, line, fmt);
+
+ bcsq_t *csq;
+ if ( fmt->usr )
+ {
+ csq = (bcsq_t*) fmt->usr;
+ if ( csq->n )
+ free(csq->str[0]);
+ csq->n = 0;
+ }
+ else
+ csq = (bcsq_t*) calloc(1,sizeof(bcsq_t));
+ fmt->usr = csq;
+
+ int i=0, len = 0;
+ char *tmp = NULL;
+ if ( bcf_get_info_string(convert->header,line,fmt->key,&tmp,&len)<0 )
+ {
+ csq->n = 0;
+ return;
+ }
+ do
+ {
+ csq->n++;
+ hts_expand(char*, csq->n, csq->m, csq->str);
+ csq->str[ csq->n-1 ] = tmp + i;
+ while ( i<len && tmp[i]!=',' ) i++;
+ if ( i<len && tmp[i]==',' ) tmp[i++] = 0;
+ }
+ while ( i<len );
+ }
+
+ bcsq_t *csq = (bcsq_t*)fmt->usr;
+
+ if ( fmt->fmt==NULL || !csq->n ) return;
+
+ csq->hap1.l = 0;
+ csq->hap2.l = 0;
+
+ int mask = fmt->subscript==0 ? 3 : 1; // merge both haplotypes if subscript==0
+
+ #define BRANCH(type_t, nbits) { \
+ type_t *x = (type_t*)(fmt->fmt->p + isample*fmt->fmt->size); \
+ int i,j; \
+ if ( fmt->subscript<=0 || fmt->subscript==1 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=0; i<nbits; i+=2) \
+ if ( val & (mask<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap1); kputc_(',', &csq->hap1); } \
+ } \
+ } \
+ if ( fmt->subscript<0 || fmt->subscript==2 ) \
+ { \
+ for (j=0; j < fmt->fmt->n; j++) \
+ { \
+ type_t val = x[j]; \
+ if ( !val ) continue; \
+ for (i=1; i<nbits; i+=2) \
+ if ( val & (1<<i) ) { kputs(csq->str[(j*32+i)/2], &csq->hap2); kputc_(',', &csq->hap2); } \
+ } \
+ } \
+ }
+ switch (fmt->fmt->type)
+ {
+ case BCF_BT_INT8: BRANCH(uint8_t, 8); break;
+ case BCF_BT_INT16: BRANCH(uint16_t,16); break;
+ case BCF_BT_INT32: BRANCH(uint32_t,32); break;
+ default: error("Unexpected type: %d\n", fmt->fmt->type); exit(1); break;
+ }
+ #undef BRANCH
+
+ if ( !csq->hap1.l && !csq->hap2.l ) return;
+
+ if ( csq->hap1.l ) csq->hap1.s[--csq->hap1.l] = 0;
+ if ( csq->hap2.l ) csq->hap2.s[--csq->hap2.l] = 0;
+
+ if ( fmt->subscript<0 )
+ {
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ kputc_('\t', str);
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+ }
+ else if ( fmt->subscript<2 )
+ kputs(csq->hap1.l?csq->hap1.s:".", str);
+ else
+ kputs(csq->hap2.l?csq->hap2.s:".", str);
+}
static void init_format_iupac(convert_t *convert, bcf1_t *line, fmt_t *fmt)
{
init_format(convert, line, fmt);
if ( line_type & VCF_MNP ) { if (i) kputc(',',str); kputs("MNP", str); i++; }
if ( line_type & VCF_INDEL ) { if (i) kputc(',',str); kputs("INDEL", str); i++; }
if ( line_type & VCF_OTHER ) { if (i) kputc(',',str); kputs("OTHER", str); i++; }
+ if ( line_type & VCF_BND ) { if (i) kputc(',',str); kputs("BND", str); i++; }
}
static void process_line(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
// the allele (0/1) and the asterisk (*); e.g., "0* 1*" for a
// heterozygous genotype of unknown phase.
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- {
- // Throw an error or silently proceed?
- //
- // for (i=0; i<convert->nsamples; i++) kputs(" ...", str);
- // return;
-
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- }
-
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? -", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 -", str); // first ALT allele
- else
- kputs("0 -", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ if ( ptr[1]==bcf_int8_vector_end )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '-'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static void process_gt_to_hap2(convert_t *convert, bcf1_t *line, fmt_t *fmt, int isample, kstring_t *str)
{
// same as process_gt_to_hap but converts haploid genotypes into diploid
- int m, n, i;
-
- m = convert->ndat / sizeof(int32_t);
- n = bcf_get_genotypes(convert->header, line, &convert->dat, &m);
- convert->ndat = m * sizeof(int32_t);
-
- if ( n<=0 )
- error("Error parsing GT tag at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
- n /= convert->nsamples;
+ int i, gt_id = bcf_hdr_id2int(convert->header, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_FMT,gt_id) )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+ bcf_fmt_t *fmt_gt = NULL;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
+ if ( !fmt_gt )
+ error("FORMAT/GT tag not present at %s:%d\n", bcf_seqname(convert->header, line), line->pos+1);
+
+ // Alloc all memory in advance to avoid kput routines. The biggest allowed allele index is 99
+ if ( line->n_allele > 100 )
+ error("Too many alleles (%d) at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+ if ( ks_resize(str, str->l+convert->nsamples*8) != 0 )
+ error("Could not alloc %d bytes\n", str->l + convert->nsamples*8);
+
+ if ( fmt_gt->type!=BCF_BT_INT8 ) // todo: use BRANCH_INT if the VCF is valid
+ error("Uh, too many alleles (%d) or redundant BCF representation at %s:%d\n", line->n_allele, bcf_seqname(convert->header, line), line->pos+1);
+
+ int8_t *ptr = ((int8_t*) fmt_gt->p) - fmt_gt->n;
for (i=0; i<convert->nsamples; i++)
{
- int32_t *ptr = (int32_t*)convert->dat + i*n;
- int j;
- for (j=0; j<n; j++)
- if ( ptr[j]==bcf_int32_vector_end ) break;
-
- if (i>0) kputs(" ", str); // no space separation for first column
- if ( j==2 )
+ ptr += fmt_gt->n;
+ if ( ptr[0]==2 )
{
- // diploid
- if ( bcf_gt_is_missing(ptr[0]) || bcf_gt_is_missing(ptr[1]) ) {
- kputs("? ?", str);
+ if ( ptr[1]==3 ) /* 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 0|1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 0 -> 0|0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 0/0 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 0/1 */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
- else if ( bcf_gt_is_phased(ptr[1])) {
- ksprintf(str, "%d %d", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 0/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
}
- else {
- ksprintf(str, "%d* %d*", bcf_gt_allele(ptr[0]), bcf_gt_allele(ptr[1]));
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 0|x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 0/x */
+ {
+ str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
}
}
- else if ( j==1 )
+ else if ( ptr[0]==4 )
{
- // haploid
- if ( bcf_gt_is_missing(ptr[0]) )
- kputs("? ?", str);
- else if ( bcf_gt_allele(ptr[0])==1 )
- kputs("1 1", str); // first ALT allele
- else
- kputs("0 0", str); // REF or something else than first ALT
+ if ( ptr[1]==3 ) /* 1|0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==5 ) /* 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end ) /* 1 -> 1|1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==2 ) /* 1/0 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==4 ) /* 1/1 */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' '; str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_missing(ptr[1]) ) /* 1/. */
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( bcf_gt_is_phased(ptr[1]) ) /* 1|x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = ' ';
+ }
+ else /* 1/x */
+ {
+ str->s[str->l++] = '1'; str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ str->s[str->l++] = '*'; str->s[str->l++] = ' ';
+ }
+ }
+ else if ( bcf_gt_is_missing(ptr[0]) )
+ {
+ str->s[str->l++] = '?'; str->s[str->l++] = ' '; str->s[str->l++] = '?'; str->s[str->l++] = ' ';
+ }
+ else if ( ptr[1]==bcf_int8_vector_end )
+ {
+ /* use REF for something else than first ALT */
+ str->s[str->l++] = '0'; str->s[str->l++] = ' '; str->s[str->l++] = '0'; str->s[str->l++] = ' ';
+ }
+ else
+ {
+ kputw(bcf_gt_allele(ptr[0]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
+ kputw(bcf_gt_allele(ptr[1]),str);
+ if ( bcf_gt_is_phased(ptr[1]) ) str->s[str->l++] = '*';
+ str->s[str->l++] = ' ';
}
- else error("FIXME: not ready for ploidy %d\n", j);
}
+ str->s[--str->l] = 0; // delete the last space
}
static fmt_t *register_tag(convert_t *convert, int type, char *key, int is_gtf)
fmt->key = key ? strdup(key) : NULL;
fmt->is_gt_field = is_gtf;
fmt->subscript = -1;
+ fmt->usr = NULL;
+ fmt->destroy = NULL;
// Allow non-format tags, such as CHROM, INFO, etc., to appear amongst the format tags.
if ( key )
{
if ( !strcmp("CHROM",key) ) { fmt->type = T_CHROM; }
else if ( !strcmp("POS",key) ) { fmt->type = T_POS; }
+ else if ( !strcmp("POS0",key) ) { fmt->type = T_POS0; }
+ else if ( !strcmp("END",key) ) { fmt->type = T_END; }
+ else if ( !strcmp("END0",key) ) { fmt->type = T_END0; }
else if ( !strcmp("ID",key) ) { fmt->type = T_ID; }
else if ( !strcmp("REF",key) ) { fmt->type = T_REF; }
else if ( !strcmp("ALT",key) ) { fmt->type = T_ALT; }
case T_GP_TO_PROB3: fmt->handler = &process_gp_to_prob3; break;
case T_CHROM: fmt->handler = &process_chrom; break;
case T_POS: fmt->handler = &process_pos; break;
+ case T_POS0: fmt->handler = &process_pos0; break;
+ case T_END: fmt->handler = &process_end; break;
+ case T_END0: fmt->handler = &process_end0; break;
case T_ID: fmt->handler = &process_id; break;
case T_REF: fmt->handler = &process_ref; break;
case T_ALT: fmt->handler = &process_alt; break;
case T_IUPAC_GT: fmt->handler = &process_iupac_gt; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP: fmt->handler = &process_gt_to_hap; convert->max_unpack |= BCF_UN_FMT; break;
case T_GT_TO_HAP2: fmt->handler = &process_gt_to_hap2; convert->max_unpack |= BCF_UN_FMT; break;
- case T_LINE: fmt->handler = &process_line; break;
+ case T_TBCSQ: fmt->handler = &process_tbcsq; fmt->destroy = &destroy_tbcsq; convert->max_unpack |= BCF_UN_FMT; break;
+ case T_LINE: fmt->handler = &process_line; convert->max_unpack |= BCF_UN_FMT; break;
default: error("TODO: handler for type %d\n", fmt->type);
}
- if ( key )
+ if ( key && fmt->type==T_INFO )
{
- if ( fmt->type==T_INFO )
+ fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
+ if ( !bcf_hdr_idinfo_exists(convert->header,BCF_HL_INFO,fmt->id) )
{
- fmt->id = bcf_hdr_id2int(convert->header, BCF_DT_ID, key);
- if ( fmt->id==-1 ) convert->undef_info_tag = strdup(key);
+ fmt->id = -1;
+ convert->undef_info_tag = strdup(key);
}
}
return fmt;
if ( !strcmp(str.s, "SAMPLE") ) register_tag(convert, T_SAMPLE, "SAMPLE", is_gtf);
else if ( !strcmp(str.s, "GT") ) register_tag(convert, T_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "TGT") ) register_tag(convert, T_TGT, "GT", is_gtf);
+ else if ( !strcmp(str.s, "TBCSQ") )
+ {
+ fmt_t *fmt = register_tag(convert, T_TBCSQ, "BCSQ", is_gtf);
+ fmt->subscript = parse_subscript(&q);
+ if ( fmt->subscript==-1 )
+ {
+ if ( !strncmp(q,"{*}",3) ) { fmt->subscript = 0; q += 3; }
+ }
+ else fmt->subscript++;
+ }
else if ( !strcmp(str.s, "IUPACGT") ) register_tag(convert, T_IUPAC_GT, "GT", is_gtf);
else if ( !strcmp(str.s, "INFO") )
{
{
if ( !strcmp(str.s, "CHROM") ) register_tag(convert, T_CHROM, str.s, is_gtf);
else if ( !strcmp(str.s, "POS") ) register_tag(convert, T_POS, str.s, is_gtf);
+ else if ( !strcmp(str.s, "POS0") ) register_tag(convert, T_POS0, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END") ) register_tag(convert, T_END, str.s, is_gtf);
+ else if ( !strcmp(str.s, "END0") ) register_tag(convert, T_END0, str.s, is_gtf);
else if ( !strcmp(str.s, "ID") ) register_tag(convert, T_ID, str.s, is_gtf);
else if ( !strcmp(str.s, "REF") ) register_tag(convert, T_REF, str.s, is_gtf);
else if ( !strcmp(str.s, "ALT") )
default: p = parse_sep(convert, p, is_gtf); break;
}
}
+ if ( is_gtf )
+ error("Could not parse the format string, missing the square bracket \"]\": %s\n", convert->format_str);
if ( nsamples )
{
{
int i;
for (i=0; i<convert->nfmt; i++)
+ {
+ if ( convert->fmt[i].destroy ) convert->fmt[i].destroy(convert->fmt[i].usr);
free(convert->fmt[i].key);
+ }
free(convert->fmt);
free(convert->undef_info_tag);
free(convert->dat);
int convert_line(convert_t *convert, bcf1_t *line, kstring_t *str)
{
if ( !convert->allow_undef_tags && convert->undef_info_tag )
- error("Error: no such tag defined in the VCF header: INFO/%s\n", convert->undef_info_tag);
+ error("Error: no such tag defined in the VCF header: INFO/%s. FORMAT fields must be in square brackets, e.g. \"[ %s]\"\n", convert->undef_info_tag,convert->undef_info_tag);
int l_ori = str->l;
bcf_unpack(line, convert->max_unpack);
str->l = 0;
for (i=0; i<convert->nfmt; i++)
{
- // Genotype fields
+ // Genotype fields.
if ( convert->fmt[i].is_gt_field )
{
int j = i, js, k;
- while ( convert->fmt[j].is_gt_field )
+ while ( j<convert->nfmt && convert->fmt[j].is_gt_field )
{
convert->fmt[j].ready = 0;
j++;
}
for (js=0; js<convert->nsamples; js++)
{
+ // Here comes a hack designed for TBCSQ. When running on large files,
+ // such as 1000GP, there are too many empty fields in the output and
+ // it's very very slow. Therefore in case the handler does not add
+ // anything to the string, we trim all genotype fields enclosed in square
+ // brackets here. This may be changed in future, time will show...
+ size_t l_start = str->l;
+
int ks = convert->samples[js];
for (k=i; k<j; k++)
{
kputc(bcf_sr_has_line(convert->readers,ir)?'1':'0', str);
}
else if ( convert->fmt[k].handler )
+ {
+ size_t l = str->l;
convert->fmt[k].handler(convert, line, &convert->fmt[k], ks, str);
+ if ( l==str->l ) { str->l = l_start; break; } // only TBCSQ does this
+ }
}
}
i = j-1;
}
else if ( convert->fmt[i].handler )
convert->fmt[i].handler(convert, line, &convert->fmt[i], -1, str);
+
}
return str->l - l_ori;
}
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ Things that would be nice to have
+ - for stop-lost events (also in frameshifts) report the number of truncated aa's
+ - memory could be greatly reduced by indexing gff (but it is quite compact already)
+ - deletions that go beyond transcript boundaries are not checked at sequence level
+ - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
+ - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
+
+ Read about transcript types here
+ http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+ http://www.ensembl.org/info/genome/variation/predicted_data.html
+ http://www.gencodegenes.org/gencode_biotypes.html
+
+ List of supported biotypes
+ antisense
+ IG_C_gene
+ IG_D_gene
+ IG_J_gene
+ IG_LV_gene
+ IG_V_gene
+ lincRNA
+ macro_lncRNA
+ miRNA
+ misc_RNA
+ Mt_rRNA
+ Mt_tRNA
+ polymorphic_pseudogene
+ processed_transcript
+ protein_coding
+ ribozyme
+ rRNA
+ sRNA
+ scRNA
+ scaRNA
+ sense_intronic
+ sense_overlapping
+ snRNA
+ snoRNA
+ TR_C_gene
+ TR_D_gene
+ TR_J_gene
+ TR_V_gene
+
+ The gff parsing logic
+ We collect features such by combining gff lines A,B,C as follows:
+ A .. gene line with a supported biotype
+ A.ID=~/^gene:/
+
+ B .. transcript line referencing A
+ B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
+
+ C .. corresponding CDS, exon, and UTR lines:
+ C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+
+ For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
+ complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
+
+
+ The supported consequence types, sorted by impact:
+ splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
+ splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
+ stop_gained .. DNA sequence variant resulting in a stop codon
+ frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
+ stop_lost .. elongated transcript, stop codon changed
+ start_lost .. the first codon changed
+ inframe_altering .. combination of indels leading to unchanged reading frame and length
+ inframe_insertion .. inserted coding sequence, unchanged reading frame
+ inframe_deletion .. deleted coding sequence, unchanged reading frame
+ missense_variant .. amino acid (aa) change, unchanged length
+ splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron
+ synonymous_variant .. DNA sequence variant resulting in no amino acid change
+ stop_retained_variant .. different stop codon
+ non_coding_variant .. variant in non-coding sequence, such as RNA gene
+ 5_prime_UTR_variant
+ 3_prime_UTR_variant
+ intron_variant .. reported only if none of the above
+ intergenic_variant .. reported only if none of the above
+
+
+ The annotation algorithm.
+ The algorithm checks if the variant falls in a region of a supported type. The
+ search is performed in the following order, until a match is found:
+ 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
+ 2. idx_utr(gf_utr_t) - check UTR hits
+ 3. idx_exon(gf_exon_t) - check for splice variants
+ 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc.
+
+ These regidx indexes are created by parsing a gff3 file as follows:
+ 1. create the array "ftr" of all UTR, CDS, exons. This will be
+ processed later and pruned based on transcript types we want to keep.
+ In the same go, create the hash "id2tr" of transcripts to keep
+ (based on biotype) which maps from transcript_id to a transcript. At
+ the same time also build the hash "gid2gene" which maps from gene_id to
+ gf_gene_t pointer.
+
+ 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
+ Use only features from "ftr" which are present in "id2tr".
+
+ 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
+
+ Data structures.
+ idx_cds, idx_utr, idx_exon, idx_tscript:
+ as described above, regidx structures for fast lookup of exons/transcripts
+ overlapping a region, the payload is a pointer to tscript.cds
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/faidx.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "regidx.h"
+#include "kheap.h"
+#include "smpl_ilist.h"
+#include "rbuf.h"
+
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+// Definition of splice_region, splice_acceptor and splice_donor
+#define N_SPLICE_DONOR 2
+#define N_SPLICE_REGION_EXON 3
+#define N_SPLICE_REGION_INTRON 8
+
+// Ensembl ID format, e.g.
+// ENST00000423372 for human .. ENST%011d
+// ENSMUST00000120394 for mouse .. ENSMUST%011d
+char ENSID_BUF[32], *ENSID_FMT = NULL;
+static inline char *ENSID(uint32_t id)
+{
+ sprintf(ENSID_BUF,ENSID_FMT,id);
+ return ENSID_BUF;
+}
+
+
+#define N_REF_PAD 10 // number of bases to avoid boundary effects
+
+#define STRAND_REV 0
+#define STRAND_FWD 1
+
+#define TRIM_NONE 0
+#define TRIM_5PRIME 1
+#define TRIM_3PRIME 2
+
+// How to treat phased/unphased genotypes
+#define PHASE_REQUIRE 0 // --phase r
+#define PHASE_MERGE 1 // --phase m
+#define PHASE_AS_IS 2 // --phase a
+#define PHASE_SKIP 3 // --phase s
+#define PHASE_NON_REF 4 // --phase R
+#define PHASE_DROP_GT 5 // --samples -
+
+// Node types in the haplotype tree
+#define HAP_CDS 0
+#define HAP_ROOT 1
+#define HAP_SSS 2 // start/stop/splice
+
+#define CSQ_PRINTED_UPSTREAM (1<<0)
+#define CSQ_SYNONYMOUS_VARIANT (1<<1)
+#define CSQ_MISSENSE_VARIANT (1<<2)
+#define CSQ_STOP_LOST (1<<3)
+#define CSQ_STOP_GAINED (1<<4)
+#define CSQ_INFRAME_DELETION (1<<5)
+#define CSQ_INFRAME_INSERTION (1<<6)
+#define CSQ_FRAMESHIFT_VARIANT (1<<7)
+#define CSQ_SPLICE_ACCEPTOR (1<<8)
+#define CSQ_SPLICE_DONOR (1<<9)
+#define CSQ_START_LOST (1<<10)
+#define CSQ_SPLICE_REGION (1<<11)
+#define CSQ_STOP_RETAINED (1<<12)
+#define CSQ_UTR5 (1<<13)
+#define CSQ_UTR3 (1<<14)
+#define CSQ_NON_CODING (1<<15)
+#define CSQ_INTRON (1<<16)
+//#define CSQ_INTERGENIC (1<<17)
+#define CSQ_INFRAME_ALTERING (1<<18)
+#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string
+#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
+#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence
+
+// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
+#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
+ CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
+ CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
+ CSQ_UPSTREAM_STOP)
+#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST)
+
+#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
+#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
+
+// see kput_vcsq()
+const char *csq_strings[] =
+{
+ NULL,
+ "synonymous",
+ "missense",
+ "stop_lost",
+ "stop_gained",
+ "inframe_deletion",
+ "inframe_insertion",
+ "frameshift",
+ "splice_acceptor",
+ "splice_donor",
+ "start_lost",
+ "splice_region",
+ "stop_retained",
+ "5_prime_utr",
+ "3_prime_utr",
+ "non_coding",
+ "intron",
+ "intergenic",
+ "inframe_altering",
+ NULL,
+ NULL,
+ "coding_sequence"
+};
+
+
+// GFF line types
+#define GFF_TSCRIPT_LINE 1
+#define GFF_GENE_LINE 2
+
+
+/*
+ Genomic features, for fast lookup by position to overlapping features
+*/
+#define GF_coding_bit 6
+#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
+#define GF_MT_rRNA 1 // non-coding: 1, 2, ...
+#define GF_MT_tRNA 2
+#define GF_lincRNA 3
+#define GF_miRNA 4
+#define GF_MISC_RNA 5
+#define GF_rRNA 6
+#define GF_snRNA 7
+#define GF_snoRNA 8
+#define GF_PROCESSED_TRANSCRIPT 9
+#define GF_ANTISENSE 10
+#define GF_macro_lncRNA 11
+#define GF_ribozyme 12
+#define GF_sRNA 13
+#define GF_scRNA 14
+#define GF_scaRNA 15
+#define GF_SENSE_INTRONIC 16
+#define GF_SENSE_OVERLAPPING 17
+#define GF_PSEUDOGENE 18
+#define GF_PROCESSED_PSEUDOGENE 19
+#define GF_ARTIFACT 20
+#define GF_IG_PSEUDOGENE 21
+#define GF_IG_C_PSEUDOGENE 22
+#define GF_IG_J_PSEUDOGENE 23
+#define GF_IG_V_PSEUDOGENE 24
+#define GF_TR_V_PSEUDOGENE 25
+#define GF_TR_J_PSEUDOGENE 26
+#define GF_MT_tRNA_PSEUDOGENE 27
+#define GF_misc_RNA_PSEUDOGENE 28
+#define GF_miRNA_PSEUDOGENE 29
+#define GF_RIBOZYME 30
+#define GF_RETAINED_INTRON 31
+#define GF_RETROTRANSPOSED 32
+#define GF_tRNA_PSEUDOGENE 33
+#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
+#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
+#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
+#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
+#define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
+#define GF_KNOWN_NCRNA 39
+#define GF_UNITARY_PSEUDOGENE 40
+#define GF_UNPROCESSED_PSEUDOGENE 41
+#define GF_LRG_GENE 42
+#define GF_3PRIME_OVERLAPPING_ncRNA 43
+#define GF_DISRUPTED_DOMAIN 44
+#define GF_vaultRNA 45
+#define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
+#define GF_AMBIGUOUS_ORF 47
+#define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
+#define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
+#define GF_IG_C (3|(1<<GF_coding_bit))
+#define GF_IG_D (4|(1<<GF_coding_bit))
+#define GF_IG_J (5|(1<<GF_coding_bit))
+#define GF_IG_LV (6|(1<<GF_coding_bit))
+#define GF_IG_V (7|(1<<GF_coding_bit))
+#define GF_TR_C (8|(1<<GF_coding_bit))
+#define GF_TR_D (9|(1<<GF_coding_bit))
+#define GF_TR_J (10|(1<<GF_coding_bit))
+#define GF_TR_V (11|(1<<GF_coding_bit))
+#define GF_NMD (12|(1<<GF_coding_bit))
+#define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
+#define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
+#define GF_EXON ((1<<(GF_coding_bit+1))+2)
+#define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
+#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
+// GF_MAX = (1<<30)-1, see hap_node_t
+
+typedef struct _tscript_t tscript_t;
+typedef struct
+{
+ tscript_t *tr; // transcript
+ uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
+ uint32_t pos; // 0-based index of the first exon base within the transcript (only to
+ // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
+ uint32_t len; // exon length
+ uint32_t icds:30, // exon index within the transcript
+ phase:2; // offset of the CDS
+}
+gf_cds_t;
+typedef struct
+{
+ char *name; // human readable name, e.g. ORF45
+ uint8_t iseq;
+}
+gf_gene_t;
+typedef struct
+{
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_exon_t;
+typedef enum { prime3, prime5 } utr_t;
+typedef struct
+{
+ utr_t which;
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_utr_t;
+
+
+/*
+ Structures related to VCF output:
+
+ vcsq_t
+ information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
+
+ vcrec_t
+ single VCF record and csq tied to this record. (Haplotype can have multiple
+ consequences in several VCF records. Each record can have multiple consequences
+ from multiple haplotypes.)
+
+ csq_t
+ a top-level consequence tied to a haplotype
+
+ vbuf_t
+ pos2vbuf
+ VCF records with the same position clustered together for a fast lookup via pos2vbuf
+*/
+typedef struct _vbuf_t vbuf_t;
+typedef struct _vcsq_t vcsq_t;
+struct _vcsq_t
+{
+ uint32_t strand:1,
+ type:31; // one of CSQ_* types
+ uint32_t trid;
+ uint32_t biotype; // one of GF_* types
+ char *gene; // gene name
+ bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
+ kstring_t vstr; // variant string, eg 5TY>5I|121ACG>A+124TA>T
+};
+typedef struct
+{
+ bcf1_t *line;
+ uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved
+ uint32_t nfmt:4, nvcsq:28, mvcsq;
+ vcsq_t *vcsq; // there can be multiple consequences for a single VCF record
+}
+vrec_t;
+typedef struct
+{
+ uint32_t pos;
+ vrec_t *vrec; // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf)
+ int idx; // 0-based index of the csq at the VCF line, for FMT/BCSQ
+ vcsq_t type;
+}
+csq_t;
+struct _vbuf_t
+{
+ vrec_t **vrec; // buffer of VCF lines with the same position
+ int n, m;
+};
+KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*)
+
+
+/*
+ Structures related to haplotype-aware consequences in coding regions
+
+ hap_node_t
+ node of a haplotype tree. Each transcript has one tree
+
+ tscript_t
+ despite its general name, it is intended for coding transcripts only
+
+ hap_t
+ hstack_t
+ for traversal of the haplotype tree and braking combined
+ consequences into independent parts
+*/
+typedef struct _hap_node_t hap_node_t;
+struct _hap_node_t
+{
+ char *seq; // cds segment [parent_node,this_node)
+ char *var; // variant "ref>alt"
+ uint32_t type:2, // HAP_ROOT or HAP_CDS
+ csq:30; // this node's consequence
+ int dlen; // alt minus ref length: <0 del, >0 ins, 0 substitution
+ uint32_t rbeg; // variant's VCF position (0-based, inclusive)
+ int32_t rlen; // variant's rlen; alen=rlen+dlen; fake for non CDS types
+ uint32_t sbeg; // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included)
+ uint32_t icds; // which exon does this node's variant overlaps
+ hap_node_t **child, *prev; // children haplotypes and previous coding node
+ int nchild, mchild;
+ bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record
+ uint32_t nend; // number of haplotypes ending in this node
+ int *cur_child, mcur_child; // mapping from the allele to the currently active child
+ csq_t *csq_list; // list of haplotype's consequences, broken by position
+ int ncsq_list, mcsq_list;
+};
+struct _tscript_t
+{
+ uint32_t id; // transcript id
+ uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
+ uint32_t strand:1, // STRAND_REV or STRAND_FWD
+ ncds:31, // number of exons
+ mcds;
+ gf_cds_t **cds; // ordered list of exons
+ char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
+ char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
+ hap_node_t *root; // root of the haplotype tree
+ hap_node_t **hap; // pointer to haplotype leaves, two for each sample
+ int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
+ uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
+ type:30; // one of GF_* types
+ gf_gene_t *gene;
+};
+static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+{
+ return ( (*a)->end < (*b)->end ) ? 1 : 0;
+}
+KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+typedef khp_trhp_t tr_heap_t;
+typedef struct
+{
+ hap_node_t *node; // current node
+ int ichild; // current child in the active node
+ int dlen; // total dlen, from the root to the active node
+ size_t slen; // total sequence length, from the root to the active node
+}
+hstack_t;
+typedef struct
+{
+ int mstack;
+ hstack_t *stack;
+ tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ kstring_t sseq; // spliced haplotype sequence on ref strand
+ kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
+ kstring_t tref; // the variable part of translated reference transcript, coding strand
+ uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
+ int upstream_stop;
+}
+hap_t;
+
+
+/*
+ Helper structures, only for initialization
+
+ ftr_t
+ temporary list of all exons, CDS, UTRs
+*/
+KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
+KHASH_MAP_INIT_INT(int2int, int)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+typedef struct
+{
+ int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
+ uint32_t beg;
+ uint32_t end;
+ uint32_t trid;
+ uint32_t strand:1; // STRAND_REV,STRAND_FWD
+ uint32_t phase:2; // 0, 1 or 2
+ uint32_t iseq:29;
+}
+ftr_t;
+typedef struct
+{
+ // all exons, CDS, UTRs
+ ftr_t *ftr;
+ int nftr, mftr;
+
+ // mapping from transcript ensembl id to gene id
+ kh_int2gene_t *gid2gene;
+
+ // mapping from transcript id to tscript, for quick CDS anchoring
+ kh_int2tscript_t *id2tr;
+
+ // sequences
+ void *seq2int;
+ char **seq;
+ int nseq, mseq;
+
+ // ignored biotypes
+ void *ignored_biotypes;
+}
+aux_t;
+
+typedef struct _args_t
+{
+ // the main regidx lookups, from chr:beg-end to overlapping features and
+ // index iterator
+ regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+ regitr_t *itr;
+
+ // temporary structures, deleted after initializtion
+ aux_t init;
+
+ // text tab-delimited output (out) or vcf/bcf output (out_fh)
+ FILE *out;
+ htsFile *out_fh;
+
+ // vcf
+ bcf_srs_t *sr;
+ bcf_hdr_t *hdr;
+ int hdr_nsmpl; // actual number of samples in the vcf, for bcf_update_format_values()
+
+ // include or exclude sites which match the filters
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // FLT_INCLUDE or FLT_EXCLUDE
+
+ // samples to process
+ int sample_is_file;
+ char *sample_list;
+ smpl_ilist_t *smpl;
+
+ char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
+ char *bcsq_tag;
+ int argc, output_type;
+ int phase, quiet, local_csq;
+ int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ
+ int ncsq_small_warned;
+
+ int rid; // current chromosome
+ tr_heap_t *active_tr; // heap of active transcripts for quick flushing
+ hap_t *hap; // transcript haplotype recursion
+ vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
+ rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
+ kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
+ tscript_t **rm_tr; // buffer of transcripts to clean
+ int nrm_tr, mrm_tr;
+ csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
+ int ncsq_buf, mcsq_buf;
+
+ faidx_t *fai;
+ kstring_t str, str2;
+ int32_t *gt_arr, mgt_arr;
+}
+args_t;
+
+// AAA, AAC, ...
+const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
+const uint8_t nt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3
+};
+const uint8_t cnt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0
+};
+#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+
+static const char *gf_strings_noncoding[] =
+{
+ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+ "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
+ "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+ "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
+};
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+const char *gf_type2gff_string(int type)
+{
+ if ( !GF_is_coding(type) )
+ {
+ if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
+ type &= (1<<(GF_coding_bit+1)) - 1;
+ return gf_strings_special[type - 1];
+ }
+ type &= (1<<GF_coding_bit) - 1;
+ return gf_strings_coding[type - 1];
+}
+
+/*
+ gff parsing functions
+*/
+static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
+{
+ aux_t *aux = &args->init;
+ char c = chr_end[1];
+ chr_end[1] = 0;
+ int iseq;
+ if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+ {
+ hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+ aux->seq[aux->nseq] = strdup(chr_beg);
+ iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+ aux->nseq++;
+ assert( aux->nseq < 256 ); // see gf_gene_t.iseq
+ }
+ chr_end[1] = c;
+ return iseq;
+}
+static inline char *gff_skip(const char *line, char *ss)
+{
+ while ( *ss && *ss!='\t' ) ss++;
+ if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return ss+1;
+}
+static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
+{
+ char *se = (char*) line;
+ while ( *se && *se!='\t' ) se++;
+ if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ *chr_beg = (char*) line;
+ *chr_end = se-1;
+}
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+ char *se = ss;
+ *beg = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+ ss = se+1;
+ *end = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return se+1;
+}
+static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ while ( *ss && !isdigit(*ss) ) ss++;
+ if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ char *se;
+ uint32_t id = strtol(ss, &se, 10);
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice
+ return id;
+}
+static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ char *se = ss;
+ while ( *se && !isdigit(*se) ) se++;
+ kstring_t str = {0,0,0};
+ kputsn(ss,se-ss,&str);
+ ss = se;
+ while ( *se && isdigit(*se) ) se++;
+ ksprintf(&str,"%%0%dd",(int)(se-ss));
+ ENSID_FMT = str.s;
+}
+static inline int gff_parse_type(char *line)
+{
+ line = strstr(line,"ID=");
+ if ( !line ) return -1;
+ line += 3;
+ if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
+ else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
+ return -1;
+}
+static inline int gff_parse_biotype(char *_line)
+{
+ char *line = strstr(_line,"biotype=");
+ if ( !line ) return -1;
+
+ line += 8;
+ switch (*line)
+ {
+ case 'p':
+ if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+ else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+ else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+ else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+ break;
+ case 'a':
+ if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+ else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+ else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+ break;
+ case 'I':
+ if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
+ else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
+ else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
+ else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
+ else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
+ else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+ break;
+ case 'T':
+ if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
+ else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
+ else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
+ else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
+ else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+ else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+ break;
+ case 'M':
+ if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+ else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
+ break;
+ case 'l':
+ if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+ break;
+ case 'm':
+ if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+ else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+ else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+ break;
+ case 'r':
+ if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+ else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+ else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+ else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+ break;
+ case 's':
+ if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+ else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+ else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+ else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+ else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+ else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+ else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+ break;
+ case 't':
+ if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+ break;
+ case 'n':
+ if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+ else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+ break;
+ case 'k':
+ if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+ break;
+ case 'u':
+ if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+ break;
+ case 'L':
+ if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+ break;
+ case '3':
+ if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+ break;
+ case 'd':
+ if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+ break;
+ case 'v':
+ if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+ break;
+ case 'b':
+ if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+ break;
+ }
+ return 0;
+}
+static inline int gff_ignored_biotype(args_t *args, char *ss)
+{
+ ss = strstr(ss,"biotype=");
+ if ( !ss ) return 0;
+
+ ss += 8;
+ char *se = ss, tmp;
+ while ( *se && *se!=';' ) se++;
+ tmp = *se;
+ *se = 0;
+
+ char *key = ss;
+ int n = 0;
+ if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
+ khash_str2int_set(args->init.ignored_biotypes, key, n+1);
+
+ *se = tmp;
+ return 1;
+}
+gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+ khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+ gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
+ if ( !gene )
+ {
+ gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+ int ret;
+ k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
+ kh_val(aux->gid2gene,k) = gene;
+ }
+ return gene;
+}
+void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored transcript: %s\n",line);
+ return;
+ }
+
+ // create a mapping from transcript_id to gene_id
+ uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
+ uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
+
+ if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species
+
+ tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
+ tr->id = trid;
+ tr->strand = ftr->strand;
+ tr->gene = gene_init(aux, gene_id);
+ tr->type = biotype;
+ tr->beg = ftr->beg;
+ tr->end = ftr->end;
+
+ khint_t k;
+ int ret;
+ k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
+ kh_val(aux->id2tr,k) = tr;
+}
+void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
+{
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(stderr,"ignored gene: %s\n",line);
+ return;
+ }
+
+ aux_t *aux = &args->init;
+
+ // substring search for "ID=gene:ENSG00000437963"
+ uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+ gf_gene_t *gene = gene_init(aux, gene_id);
+ assert( !gene->name ); // the gene_id should be unique
+
+ gene->iseq = feature_set_seq(args, chr_beg,chr_end);
+
+ // substring search for "Name=OR4F5"
+ ss = strstr(chr_end+2,"Name=");
+ if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
+ ss += 5;
+ char *se = ss;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ gene->name = (char*) malloc(se-ss+1);
+ memcpy(gene->name,ss,se-ss);
+ gene->name[se-ss] = 0;
+}
+int gff_parse(args_t *args, char *line, ftr_t *ftr)
+{
+ // - skip empty lines and commented lines
+ // - columns
+ // 1. chr
+ // 2. <skip>
+ // 3. CDS, transcript, gene, ...
+ // 4-5. beg,end
+ // 6. <skip>
+ // 7. strand
+ // 8. phase
+ // 9. Parent=transcript:ENST(\d+);ID=... etc
+
+ char *ss = line;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *chr_beg, *chr_end;
+ gff_parse_chr(line, &chr_beg, &chr_end);
+ ss = gff_skip(line, chr_end + 2);
+
+ // 3. column: is this a CDS, transcript, gene, etc.
+ if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+ else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+ else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+ else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+ else
+ {
+ ss = gff_skip(line, ss);
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+ int type = gff_parse_type(ss);
+ if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
+ {
+ // we ignore these, debug print to see new types:
+ ss = strstr(ss,"ID=");
+ if ( !ss ) return -1; // no ID, ignore the line
+ if ( !strncmp("chromosome",ss+3,10) ) return -1;
+ if ( !strncmp("supercontig",ss+3,11) ) return -1;
+ if ( args->quiet<2 ) fprintf(stderr,"ignored: %s\n", line);
+ return -1;
+ }
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else error("Unknown strand: %c .. %s\n", *ss,ss);
+
+ if ( type==GFF_TSCRIPT_LINE )
+ gff_parse_transcript(args, line, ss, ftr);
+ else
+ gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
+
+ return -1;
+ }
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
+ ss += 2;
+
+ // 8. column: phase (codon offset)
+ if ( *ss == '0' ) ftr->phase = 0;
+ else if ( *ss == '1' ) ftr->phase = 1;
+ else if ( *ss == '2' ) ftr->phase = 2;
+ else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase
+ else { if ( args->quiet<2 ) fprintf(stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
+ ss += 2;
+
+ // substring search for "Parent=transcript:ENST00000437963"
+ ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+ ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
+ return 0;
+}
+
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+ // comparison function for qsort of transcripts's CDS
+ if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
+ if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
+ return 0;
+}
+
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+ *chr_beg = *chr_end = aux->seq[iseq];
+ while ( (*chr_end)[1] ) (*chr_end)++;
+}
+tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
+ tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
+ assert( tr );
+ return tr;
+}
+void register_cds(args_t *args, ftr_t *ftr)
+{
+ // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
+ // ftr is the result of parsing a gff CDS line
+ aux_t *aux = &args->init;
+
+ tscript_t *tr = tscript_init(aux, ftr->trid);
+ if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
+
+ gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
+ cds->tr = tr;
+ cds->beg = ftr->beg;
+ cds->len = ftr->end - ftr->beg + 1;
+ cds->icds = 0; // to keep valgrind on mac happy
+ cds->phase = ftr->phase;
+
+ hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
+ tr->cds[tr->ncds++] = cds;
+}
+void register_utr(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+ utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
+ utr->beg = ftr->beg;
+ utr->end = ftr->end;
+ utr->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
+}
+void register_exon(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+ exon->beg = ftr->beg;
+ exon->end = ftr->end;
+ exon->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
+}
+
+void tscript_init_cds(args_t *args)
+{
+ aux_t *aux = &args->init;
+
+ // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+ khint_t k;
+ for (k=0; k<kh_end(aux->id2tr); k++)
+ {
+ if ( !kh_exist(aux->id2tr, k) ) continue;
+ tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
+
+ // position-to-tscript lookup
+ char *chr_beg, *chr_end;
+ chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+ if ( !tr->ncds ) continue; // transcript with no CDS
+
+ // sort CDs
+ qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+ // trim non-coding start
+ int i, len = 0;
+ if ( tr->strand==STRAND_FWD )
+ {
+ if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+ tr->cds[0]->beg += tr->cds[0]->phase;
+ tr->cds[0]->len -= tr->cds[0]->phase;
+ tr->cds[0]->phase = 0;
+
+ // sanity check phase
+ for (i=0; i<tr->ncds; i++)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ assert( phase == len%3 );
+ len += tr->cds[i]->len;
+ }
+ }
+ else
+ {
+ // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+ // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
+ // todo: the same for the fwd strand
+ i = tr->ncds - 1;
+ int phase = tr->cds[i]->phase;
+ if ( phase ) tr->trim |= TRIM_5PRIME;
+ while ( i>=0 && phase > tr->cds[i]->len )
+ {
+ phase -= tr->cds[i]->len;
+ tr->cds[i]->phase = 0;
+ tr->cds[i]->len = 0;
+ i--;
+ }
+ tr->cds[i]->len -= tr->cds[i]->phase;
+ tr->cds[i]->phase = 0;
+
+ // sanity check phase
+ for (i=tr->ncds-1; i>=0; i--)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ len += tr->cds[i]->len;
+ }
+ }
+
+ // set len. At the same check that CDS within a transcript do not overlap
+ len = 0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->icds = i;
+ len += tr->cds[i]->len;
+ if ( !i ) continue;
+
+ gf_cds_t *a = tr->cds[i-1];
+ gf_cds_t *b = tr->cds[i];
+ if ( a->beg + a->len - 1 >= b->beg )
+ error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n",
+ kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
+ if ( len%3 != 0 )
+ {
+ // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+ // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+ // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+ tr->trim |= TRIM_3PRIME;
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = tr->ncds - 1;
+ while ( i>=0 && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ len -= dlen;
+ i--;
+ }
+ }
+ else
+ {
+ i = 0;
+ while ( i<tr->ncds && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ tr->cds[i]->beg += dlen;
+ len -= dlen;
+ i++;
+ }
+ }
+ }
+
+ // set CDS offsets and insert into regidx
+ len=0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->pos = len;
+ len += tr->cds[i]->len;
+ regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+ }
+ }
+}
+
+void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
+void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
+
+void init_gff(args_t *args)
+{
+ aux_t *aux = &args->init;
+ aux->seq2int = khash_str2int_init(); // chrom's numeric id
+ aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
+ aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
+ args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
+ aux->ignored_biotypes = khash_str2int_init();
+
+ // parse gff
+ kstring_t str = {0,0,0};
+ htsFile *fp = hts_open(args->gff_fname,"r");
+ if ( !fp ) error("Failed to read %s\n", args->gff_fname);
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+ int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
+ if ( !ret ) aux->nftr++;
+ }
+ free(str.s);
+ if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
+
+
+ // process gff information: connect CDS and exons to transcripts
+ args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+ args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+ args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+ args->itr = regitr_init(NULL);
+
+ int i;
+ for (i=0; i<aux->nftr; i++)
+ {
+ ftr_t *ftr = &aux->ftr[i];
+
+ // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+ if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
+
+ tscript_t *tr = kh_val(aux->id2tr,k);
+ if ( !tr->gene->name )
+ {
+ // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
+ regidx_free_tscript(&tr);
+ kh_del(int2tscript, aux->id2tr,k);
+ continue;
+ }
+
+ // populate regidx by category:
+ // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+ // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+ if ( ftr->type==GF_CDS ) register_cds(args, ftr);
+ else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
+ else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
+ else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
+ else
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+ }
+ tscript_init_cds(args);
+
+ if ( !args->quiet )
+ {
+ fprintf(stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+ regidx_nregs(args->idx_tscript),
+ regidx_nregs(args->idx_exon),
+ regidx_nregs(args->idx_cds),
+ regidx_nregs(args->idx_utr));
+ }
+
+ free(aux->ftr);
+ khash_str2int_destroy_free(aux->seq2int);
+ // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+ kh_destroy(int2tscript,aux->id2tr);
+ free(aux->seq);
+
+ if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
+ {
+ khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+ fprintf(stderr,"Ignored the following biotypes:\n");
+ for (i = kh_begin(ign); i < kh_end(ign); i++)
+ {
+ if ( !kh_exist(ign,i)) continue;
+ fprintf(stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i));
+ }
+ }
+ khash_str2int_destroy_free(aux->ignored_biotypes);
+}
+
+void init_data(args_t *args)
+{
+ args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32;
+
+ if ( !args->quiet ) fprintf(stderr,"Parsing %s ...\n", args->gff_fname);
+ init_gff(args);
+
+ args->rid = -1;
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ args->fai = fai_load(args->fa_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
+
+ args->pos2vbuf = kh_init(pos2vbuf);
+ args->active_tr = khp_init(trhp);
+ args->hap = (hap_t*) calloc(1,sizeof(hap_t));
+
+ // init samples
+ if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT;
+ if ( args->sample_list && !strcmp("-",args->sample_list) )
+ {
+ // ignore all samples
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ // significant speedup for plain VCFs
+ bcf_hdr_set_samples(args->hdr,NULL,0);
+ }
+ args->phase = PHASE_DROP_GT;
+ }
+ else
+ args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT);
+ args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr);
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ args->out = args->output_fname ? fopen(args->output_fname,"w") : stdout;
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno));
+
+ fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++)
+ fprintf(args->out," %s",args->argv[i]);
+ fprintf(args->out,"\n");
+ fprintf(args->out,"# LOG\t[2]Message\n");
+ fprintf(args->out,"# CSQ"); i = 1;
+ fprintf(args->out,"\t[%d]Sample", ++i);
+ fprintf(args->out,"\t[%d]Haplotype", ++i);
+ fprintf(args->out,"\t[%d]Chromosome", ++i);
+ fprintf(args->out,"\t[%d]Position", ++i);
+ fprintf(args->out,"\t[%d]Consequence", ++i);
+ fprintf(args->out,"\n");
+ }
+ else
+ {
+ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
+ bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
+ bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq. Format: '[*]consequence|gene|transcript|biotype[|strand|amino_acid_change|dna_change]' or, for consequences of variants split across multiple sites, a pointer to the record storing the consequences '@position'. '*' prefix indicates a consequence downstream from a stop \">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
+ if ( args->hdr_nsmpl )
+ bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
+ bcf_hdr_write(args->out_fh, args->hdr);
+ }
+ if ( !args->quiet ) fprintf(stderr,"Calling...\n");
+}
+
+void destroy_data(args_t *args)
+{
+ regidx_destroy(args->idx_cds);
+ regidx_destroy(args->idx_utr);
+ regidx_destroy(args->idx_exon);
+ regidx_destroy(args->idx_tscript);
+ regitr_destroy(args->itr);
+
+ khint_t k,i,j;
+ for (k=0; k<kh_end(args->init.gid2gene); k++)
+ {
+ if ( !kh_exist(args->init.gid2gene, k) ) continue;
+ gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
+ free(gene->name);
+ free(gene);
+ }
+ kh_destroy(int2gene,args->init.gid2gene);
+
+ if ( args->filter )
+ filter_destroy(args->filter);
+
+ khp_destroy(trhp,args->active_tr);
+ kh_destroy(pos2vbuf,args->pos2vbuf);
+ if ( args->smpl ) smpl_ilist_destroy(args->smpl);
+ int ret;
+ if ( args->out_fh )
+ ret = hts_close(args->out_fh);
+ else
+ ret = fclose(args->out);
+ if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
+ for (i=0; i<args->vcf_rbuf.m; i++)
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ if ( !vbuf ) continue;
+ for (j=0; j<vbuf->m; j++)
+ {
+ if ( !vbuf->vrec[j] ) continue;
+ if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
+ free(vbuf->vrec[j]->smpl);
+ free(vbuf->vrec[j]->vcsq);
+ free(vbuf->vrec[j]);
+ }
+ free(vbuf->vrec);
+ free(vbuf);
+ }
+ free(args->vcf_buf);
+ free(args->rm_tr);
+ free(args->csq_buf);
+ free(args->hap->stack);
+ free(args->hap->sseq.s);
+ free(args->hap->tseq.s);
+ free(args->hap->tref.s);
+ free(args->hap);
+ fai_destroy(args->fai);
+ free(args->gt_arr);
+ free(args->str.s);
+ free(args->str2.s);
+ free(ENSID_FMT);
+}
+
+/*
+ The splice_* functions are for consquences around splice sites: start,stop,splice_*
+ */
+#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely
+#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region
+#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
+#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
+typedef struct
+{
+ tscript_t *tr;
+ struct {
+ int32_t pos, rlen, alen;
+ char *ref, *alt;
+ bcf1_t *rec;
+ } vcf;
+ uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev)
+ check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
+ check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
+ check_donor:1, // as with check_acceptor
+ check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon
+ check_region_end:1, //
+ check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr
+ set_refalt:1; // set kref,kalt, if set, check also for synonymous events
+ uint32_t csq;
+ int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele
+ uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
+ ref_end; // a more conservative csq (the first and last base in kref.s)
+ kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP
+}
+splice_t;
+void splice_init(splice_t *splice, bcf1_t *rec)
+{
+ memset(splice,0,sizeof(*splice));
+ splice->vcf.rec = rec;
+ splice->vcf.pos = rec->pos;
+ splice->vcf.rlen = rec->rlen;
+ splice->vcf.ref = rec->d.allele[0];
+}
+static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
+{
+ // len>0 .. beg is the first base, del filled from right
+ // len<0 .. beg is the last base, del filled from left
+
+ int rlen, alen, rbeg, abeg; // first base to include (ref coordinates)
+ if ( len<0 )
+ {
+ rlen = alen = -len;
+ rbeg = beg - rlen + 1;
+ int dlen = splice->vcf.alen - splice->vcf.rlen;
+ if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle
+ dlen += splice->ref_end - beg;
+ abeg = rbeg + dlen;
+ }
+ else
+ {
+ rbeg = abeg = beg;
+ rlen = alen = len;
+ // check for incomplete del as above??
+ }
+
+#define XDBG 0
+#if XDBG
+fprintf(stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
+#endif
+ splice->kref.l = 0;
+ splice->kalt.l = 0;
+
+ // add the part before vcf.ref, in the vcf.ref and after vcf.ref
+ int roff; // how many vcf.ref bases already used
+ if ( rbeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
+ kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+ roff = 0;
+ }
+ else
+ roff = rbeg - splice->vcf.pos;
+#if XDBG
+fprintf(stderr,"r1: %s roff=%d\n",splice->kref.s,roff);
+#endif
+
+ if ( roff < splice->vcf.rlen && splice->kref.l < rlen )
+ {
+ int len = splice->vcf.rlen - roff; // len still available in vcf.ref
+ if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed
+ kputsn(splice->vcf.ref + roff, len, &splice->kref);
+ }
+#if XDBG
+fprintf(stderr,"r2: %s\n",splice->kref.s);
+#endif
+
+ uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kref.l < rlen )
+ {
+ if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
+ rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
+ if ( splice->kref.l < rlen )
+ kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+ }
+#if XDBG
+fprintf(stderr,"r3: %s\n",splice->kref.s);
+#endif
+
+
+ int aoff;
+ if ( abeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= abeg );
+ kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+ aoff = 0;
+ }
+ else
+ aoff = abeg - splice->vcf.pos;
+#if XDBG
+fprintf(stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ if ( aoff < splice->vcf.alen && splice->kalt.l < alen )
+ {
+ int len = splice->vcf.alen - aoff; // len still available in vcf.alt
+ if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed
+ kputsn(splice->vcf.alt + aoff, len, &splice->kalt);
+ aoff -= len;
+ }
+ if ( aoff < 0 ) aoff = 0;
+ else aoff--;
+#if XDBG
+fprintf(stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kalt.l < alen )
+ {
+ if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
+ alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
+ if ( alen > 0 && alen > splice->kalt.l )
+ kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+ }
+#if XDBG
+fprintf(stderr,"a3: %s\n",splice->kalt.s);
+fprintf(stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s);
+#endif
+}
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid)
+{
+ while ( regitr_overlap(itr) )
+ {
+ gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
+ tscript_t *tr = utr->tr;
+ if ( tr->id != trid ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ return csq.type.type;
+ }
+ return 0;
+}
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type)
+{
+#if XDBG
+fprintf(stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
+#endif
+ if ( !type ) return;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = type;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+}
+static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
+ // before and after the inserted bases
+ if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] )
+ {
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+ else
+ {
+ if ( splice->tend ) splice->tend--;
+ splice->ref_beg = splice->vcf.pos;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+#if XDBG
+fprintf(stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ int ret;
+ if ( splice->ref_beg >= ex_end ) // fully outside, beyond the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_end ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_beg ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ // overlaps the exon or inside the exon
+ // possible todo: find better alignment for frameshifting variants?
+ if ( splice->ref_beg <= ex_beg + 2 ) // in the first 3bp
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 2 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ // Make sure the variant will not end up left aligned to avoid overlapping vcf records
+ // splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ // splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ // if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ if ( splice->ref_beg < splice->vcf.pos ) // this must have been caused by too much trimming from right
+ {
+ int dlen = splice->vcf.pos - splice->ref_beg;
+ assert( dlen==1 );
+ splice->tbeg += dlen;
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen;
+ splice->ref_beg = splice->vcf.pos;
+ }
+ if ( splice->ref_end==ex_beg ) splice->tend--; // prevent zero-length ref allele
+ splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base
+
+#if XDBG
+fprintf(stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ // filling from the left does not work for ENST00000341065/frame3.vcf
+ // CAG.GTGGCCAG CAG.GTGGCCAG
+ // CA-.--GGCCAG vs CAG.---GCCAG
+ // splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON);
+ //
+ // filling from the right:
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
+ splice->ref_beg = ex_beg - 1;
+ if ( splice->tbeg + splice->tend == splice->vcf.alen )
+ {
+ // the deletion overlaps ex_beg and cannot be easily realigned to the right
+ if ( !splice->tend )
+ {
+ splice->csq |= CSQ_CODING_SEQUENCE;
+ return SPLICE_OVERLAP;
+ }
+ splice->tend--;
+ }
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); // ref,alt positioned at the first intron base
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_beg < ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ if ( splice->tbeg>0 ) splice->tbeg--; //why is this?
+ if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->vcf.alen -= splice->tbeg + splice->tend;
+ }
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
+ if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
+ {
+ splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
+ return SPLICE_OVERLAP;
+ }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // not a real variant, can be ignored: eg ACGT>ACGT
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF;
+
+ splice->ref_beg = splice->vcf.pos + splice->tbeg;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;
+
+#if XDBG
+fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg < ex_beg ) // the part before the exon
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos;
+ splice->ref_beg = ex_beg;
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_beg <= ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 3 )
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ splice->csq = 0;
+ splice->vcf.alen = strlen(splice->vcf.alt);
+
+ int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0;
+ splice->tbeg = 0, splice->tend = 0;
+
+ // trim from the right, then from the left
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break;
+ i++;
+ }
+ splice->tend = i;
+ rlen1 -= i, alen1 -= i, i = 0;
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break;
+ i++;
+ }
+ splice->tbeg = i;
+
+ // The mnp, ins and del code was split into near-identical functions for clarity and debugging;
+ // possible todo: generalize once stable
+ if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end);
+
+ return 0;
+}
+
+// return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref)
+int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial)
+{
+ int i;
+ kstring_t str = {0,0,0};
+ tscript_t *tr = cds->tr;
+ child->icds = cds->icds; // index of cds in the tscript's list of exons
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.tr = tr;
+ splice.vcf.alt = rec->d.allele[ial];
+ splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
+ if ( !(tr->trim & TRIM_5PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
+ else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
+ }
+ if ( !(tr->trim & TRIM_3PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
+ else { if ( child->icds==0 ) splice.check_stop = 1; }
+ }
+ if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ }
+ if ( child->icds!=0 ) splice.check_region_beg = 1;
+ if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
+
+#if XDBG
+fprintf(stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop);
+#endif
+ int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1);
+#if XDBG
+fprintf(stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq);
+#endif
+
+ if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA
+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq
+ {
+ free(splice.kref.s);
+ free(splice.kalt.s);
+
+ if ( !splice.csq ) return 2; // fully intronic, no csq
+
+ // splice_region/acceptor/donor
+ child->seq = NULL;
+ child->sbeg = 0;
+ child->rbeg = rec->pos;
+ child->rlen = 0;
+ child->dlen = 0;
+ kputs(rec->d.allele[0],&str);
+ kputc('>',&str);
+ kputs(rec->d.allele[ial],&str);
+ child->var = str.s;
+ child->type = HAP_SSS;
+ child->csq = splice.csq;
+ child->prev = parent->type==HAP_SSS ? parent->prev : parent;
+ child->rec = rec;
+ return 0;
+ }
+ if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT; // synonymous&splice,frame could become synonymous&frame,splice
+
+ int dbeg = 0;
+ if ( splice.ref_beg < cds->beg )
+ {
+ // The vcf record overlaps the exon boundary, but the variant itself
+ // should fit inside since we are here. This will need more work.
+ // #1475227917
+ dbeg = cds->beg - splice.ref_beg;
+ splice.kref.l -= dbeg;
+ splice.ref_beg = cds->beg;
+ assert( dbeg <= splice.kalt.l );
+ }
+
+ if ( parent->type==HAP_SSS ) parent = parent->prev;
+ if ( parent->type==HAP_CDS )
+ {
+ i = parent->icds;
+ if ( i!=cds->icds )
+ {
+ // the variant is on a new exon, finish up the previous
+ int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
+ if ( len > 0 )
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+
+ // append any skipped non-variant exons
+ while ( ++i < cds->icds )
+ kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+
+ if ( parent->icds==child->icds )
+ {
+ int len = splice.ref_beg - parent->rbeg - parent->rlen;
+ if ( len < 0 ) // overlapping variants
+ {
+ free(str.s);
+ return 1;
+ }
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+ else
+ kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+ }
+ kputs(splice.kalt.s + dbeg, &str);
+
+ child->seq = str.s;
+ child->sbeg = cds->pos + (splice.ref_beg - cds->beg);
+ child->rbeg = splice.ref_beg;
+ child->rlen = splice.kref.l;
+ child->type = HAP_CDS;
+ child->prev = parent;
+ child->rec = rec;
+ child->csq = splice.csq;
+
+ // set vlen and the "ref>alt" string
+ {
+ int rlen = strlen(rec->d.allele[0]);
+ int alen = strlen(rec->d.allele[ial]);
+ child->dlen = alen - rlen;
+ child->var = (char*) malloc(rlen+alen+2);
+ memcpy(child->var,rec->d.allele[0],rlen);
+ child->var[rlen] = '>';
+ memcpy(child->var+rlen+1,rec->d.allele[ial],alen);
+ child->var[rlen+alen+1] = 0;
+ }
+
+ // yuck, the whole CDS is modified/deleted, not ready for this, todo.
+ if ( child->rbeg + child->rlen > cds->beg + cds->len )
+ {
+ child->type = HAP_SSS;
+ if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf
+ }
+
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return 0;
+}
+void hap_destroy(hap_node_t *hap)
+{
+ int i;
+ for (i=0; i<hap->nchild; i++)
+ if ( hap->child[i] ) hap_destroy(hap->child[i]);
+ for (i=0; i<hap->mcsq_list; i++) free(hap->csq_list[i].type.vstr.s);
+ free(hap->csq_list);
+ free(hap->child);
+ free(hap->cur_child);
+ free(hap->seq);
+ free(hap->var);
+ free(hap);
+}
+
+
+/*
+ ref: spliced reference and its length (ref.l)
+ seq: part of the spliced query transcript on the reference strand to translate, its
+ length (seq.l) and the total length of the complete transcript (seq.m)
+ sbeg: seq offset within the spliced query transcript
+ rbeg: seq offset within ref, 0-based
+ rend: last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l
+ strand: coding strand - 0:rev, 1:fwd
+ tseq: translated sequence (aa)
+ fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
+ */
+void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
+{
+#if XDBG
+fprintf(stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+#endif
+ char tmp[3], *codon, *end;
+ int i, len, npad;
+
+ kstring_t ref = *_ref;
+ kstring_t seq = *_seq;
+
+ tseq->l = 0;
+ if ( !seq.l )
+ {
+ kputc('?', tseq);
+ return;
+ }
+
+#define DBG 0
+#if DBG
+ fprintf(stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+ fprintf(stderr," ref: l=%d %s\n", (int)ref.l,ref.s);
+ fprintf(stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m);
+ for (i=0; i<seq.l; i++) fprintf(stderr,"%c",seq.s[i]); fprintf(stderr,"\n");
+ fprintf(stderr," sbeg,rbeg,rend: %d,%d,%d\n", sbeg,rbeg,rend);
+ fprintf(stderr," strand,fill: %d,%d\n", strand,fill);
+#endif
+
+ if ( strand==STRAND_FWD )
+ {
+ // left padding
+ npad = sbeg % 3;
+#if DBG>1
+ fprintf(stderr," npad: %d\n",npad);
+#endif
+ assert( npad<=rbeg );
+
+ for (i=0; i<npad; i++)
+ tmp[i] = ref.s[rbeg+i-npad+N_REF_PAD];
+ for (; i<3 && i-npad<seq.l; i++)
+ tmp[i] = seq.s[i-npad];
+ len = seq.l - i + npad; // the remaining length of padded sseq
+#if DBG>1
+ fprintf(stderr,"\t i=%d\n", i);
+#endif
+ if ( i==3 )
+ {
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ codon = seq.s + 3 - npad; // next codon
+ end = codon + len - 1 - (len % 3); // last position of a valid codon
+ while ( codon < end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
+#endif
+ codon += 3;
+ }
+ end = seq.s + seq.l - 1;
+ for (i=0; codon+i<=end; i++) tmp[i] = codon[i];
+ }
+
+ // right padding
+ codon = ref.s + rend + N_REF_PAD;
+ if ( i>0 )
+ {
+#if DBG>1
+ if(i==1)fprintf(stderr,"[3]%c\n",tmp[0]);
+ if(i==2)fprintf(stderr,"[3]%c%c\n",tmp[0],tmp[1]);
+#endif
+ for (; i<3; i++)
+ {
+ tmp[i] = *codon;
+ codon++;
+ }
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ }
+ if ( fill!=0 )
+ {
+ end = ref.s + ref.l - N_REF_PAD;
+ while ( codon+3 <= end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
+#endif
+ codon += 3;
+ }
+ }
+ }
+ else // STRAND_REV
+ {
+ // right padding - number of bases to take from ref
+ npad = (seq.m - (sbeg + seq.l)) % 3;
+#if DBG>1
+ fprintf(stderr," npad: %d\n",npad);
+#endif
+if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m);
+ assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand
+
+ if ( npad==2 )
+ {
+ tmp[1] = ref.s[rend+N_REF_PAD];
+ tmp[2] = ref.s[rend+N_REF_PAD+1];
+ i = 0;
+ }
+ else if ( npad==1 )
+ {
+ tmp[2] = ref.s[rend+N_REF_PAD];
+ i = 1;
+ }
+ else
+ i = 2;
+
+ end = seq.s + seq.l;
+ for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end);
+#if DBG>1
+ fprintf(stderr,"\t i=%d\n", i);
+ if(i==1)fprintf(stderr,"[0] %c\n",tmp[2]);
+ if(i==0)fprintf(stderr,"[0] %c%c\n",tmp[1],tmp[2]);
+#endif
+ if ( i==-1 )
+ {
+#if DBG>1
+ fprintf(stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
+#endif
+ kputc_(cdna2aa(tmp), tseq);
+ codon = end - 3;
+ while ( codon >= seq.s )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ if ( seq.s-codon==2 )
+ {
+ tmp[2] = seq.s[0];
+ i = 1;
+ }
+ else if ( seq.s-codon==1 )
+ {
+ tmp[1] = seq.s[0];
+ tmp[2] = seq.s[1];
+ i = 0;
+ }
+ else
+ i = -1;
+#if DBG>1
+ if(i==1)fprintf(stderr,"[3] %c\n",tmp[2]);
+ if(i==0)fprintf(stderr,"[3] %c%c\n",tmp[1],tmp[2]);
+#endif
+ }
+ // left padding
+ end = ref.s + N_REF_PAD + rbeg;
+ if ( i>=0 )
+ {
+ for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
+ kputc_(cdna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
+#endif
+ }
+ if ( fill!=0 )
+ {
+ codon = end - 3;
+ while ( codon >= ref.s + N_REF_PAD )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ }
+ }
+ kputc_(0,tseq); tseq->l--;
+#if DBG
+ fprintf(stderr," tseq: %s\n", tseq->s);
+#endif
+}
+
+void tscript_splice_ref(tscript_t *tr)
+{
+ int i, len = 0;
+ for (i=0; i<tr->ncds; i++)
+ len += tr->cds[i]->len;
+
+ tr->nsref = len + 2*N_REF_PAD;
+ tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
+ len = 0;
+
+ memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ for (i=0; i<tr->ncds; i++)
+ {
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+ len += tr->cds[i]->len;
+ }
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ tr->sref[len] = 0;
+}
+
+// returns: 0 if consequence was added, 1 if it already exists or could not be added
+int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+#if XDBG
+fprintf(stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
+#endif
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
+ vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
+ if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+
+ int i;
+ for (i=0; i<vbuf->n; i++)
+ if ( vbuf->vrec[i]->line==rec ) break;
+ if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+ vrec_t *vrec = vbuf->vrec[i];
+
+ // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
+ if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
+ csq->type.type &= ~CSQ_SPLICE_REGION;
+
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ // Same as below, to avoid records like
+ // 3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // 3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i] = csq->type;
+ goto exit_duplicate;
+ }
+ if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue;
+ if ( csq->type.ref != vrec->vcsq[i].ref ) continue;
+ goto exit_duplicate;
+ }
+ }
+ else if ( csq->type.type & CSQ_COMPOUND )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
+ if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
+ {
+ // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
+ // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
+ // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two
+ // consequences:
+ // stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
+ if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
+ {
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+
+ // remove stop_lost&synonymous if stop_retained set
+ if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
+ vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
+
+ if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
+ goto exit_duplicate;
+ }
+ continue;
+ }
+ if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
+ }
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ }
+ else
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate;
+ }
+ }
+ // no such csq yet in this vcf record
+ csq->vrec = vrec;
+ csq->idx = i;
+ vrec->nvcsq++;
+ hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
+ vrec->vcsq[i] = csq->type;
+ return 0;
+
+exit_duplicate:
+ csq->vrec = vrec;
+ csq->idx = i;
+ return 1;
+}
+
+// soff .. position of the variant within the trimmed query transcript
+// sbeg .. position of the variant within the query transcript
+// rbeg .. position on the reference transcript (if there are no indels, then rbeg=send)
+// rpos .. VCF position
+#define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen))
+#define node2sbeg(i) (hap->sbeg + node2soff(i))
+#define node2send(i) (hap->sbeg + hap->stack[i].slen)
+#define node2rbeg(i) (hap->stack[i].node->sbeg)
+#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
+#define node2rpos(i) (hap->stack[i].node->rec->pos)
+
+void kput_vcsq(vcsq_t *csq, kstring_t *str)
+{
+ // Remove start/stop from incomplete CDS, but only if there is another
+ // consequence as something must be reported
+ if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS);
+
+ // Remove missense from start/stops
+ if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT;
+
+ if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref )
+ {
+ kputc_('@',str);
+ kputw(csq->ref->pos+1, str);
+ return;
+ }
+ if ( csq->type & CSQ_UPSTREAM_STOP )
+ kputc_('*',str);
+
+ int i, n = sizeof(csq_strings)/sizeof(char*);
+ for (i=1; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+ i++;
+ for (; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+
+ kputc_('|', str);
+ if ( csq->gene ) kputs(csq->gene , str);
+
+ kputc_('|', str);
+ if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+
+ kputc_('|', str);
+ kputs(gf_type2gff_string(csq->biotype), str);
+
+ if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
+ kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
+
+ if ( csq->vstr.l )
+ kputs(csq->vstr.s, str);
+}
+
+void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
+{
+ int i;
+ tscript_t *tr = hap->tr;
+ int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
+
+ int icsq = node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *csq = &node->csq_list[icsq];
+ csq->pos = hap->stack[ref_node].node->rec->pos;
+ csq->type.trid = tr->id;
+ csq->type.gene = tr->gene->name;
+ csq->type.strand = tr->strand;
+ csq->type.biotype = tr->type;
+
+ // only now we see the translated sequence and can determine if the stop/start changes are real
+ int rm_csq = 0;
+ csq->type.type = 0;
+ for (i=ibeg; i<=iend; i++)
+ csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
+ if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING;
+
+ int has_upstream_stop = hap->upstream_stop;
+ if ( hap->stack[ibeg].node->type != HAP_SSS )
+ {
+ // check for truncating stops
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i]=='*' ) break;
+ if ( i!=hap->tref.l )
+ {
+ hap->tref.l = i+1;
+ hap->tref.s[i+1] = 0;
+ }
+ for (i=0; i<hap->tseq.l; i++)
+ if ( hap->tseq.s[i]=='*' ) break;
+ if ( i!=hap->tseq.l )
+ {
+ hap->tseq.l = i+1;
+ hap->tseq.s[i+1] = 0;
+ hap->upstream_stop = 1;
+ }
+ if ( csq->type.type & CSQ_STOP_LOST )
+ {
+ if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ {
+ rm_csq |= CSQ_STOP_LOST;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else if ( hap->tref.s[hap->tref.l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( hap->tseq.s[hap->tseq.l-1] == '*' )
+ {
+ rm_csq |= CSQ_STOP_GAINED;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq->type.type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
+ {
+ rm_csq |= CSQ_START_LOST;
+ csq->type.type &= ~CSQ_START_LOST;
+ }
+ if ( dlen!=0 )
+ {
+ if ( dlen%3 )
+ csq->type.type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( dlen<0 )
+ csq->type.type |= CSQ_INFRAME_DELETION;
+ else
+ csq->type.type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
+ if ( i==hap->tref.l )
+ csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( hap->tref.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_LOST;
+ else if ( hap->tseq.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_GAINED;
+ else
+ csq->type.type |= CSQ_MISSENSE_VARIANT;
+ }
+ }
+ if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
+ csq->type.type &= ~rm_csq;
+
+ if ( hap->stack[ibeg].node->type == HAP_SSS )
+ {
+ node->csq_list[icsq].type.type |= hap->stack[ibeg].node->csq & ~rm_csq;
+ node->csq_list[icsq].type.ref = hap->stack[ibeg].node->rec;
+ node->csq_list[icsq].type.biotype = tr->type;
+ csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec);
+ return;
+ }
+
+ kstring_t str = node->csq_list[icsq].type.vstr;
+ str.l = 0;
+
+ // create the aa variant string
+ int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(hap->tref.s, &str);
+ if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(hap->tseq.s, &str);
+ }
+ kputc_('|', &str);
+
+ // create the dna variant string and, in case of combined variants,
+ // insert silent CSQ_PRINTED_UPSTREAM variants
+ for (i=ibeg; i<=iend; i++)
+ {
+ if ( i>ibeg ) kputc_('+', &str);
+ kputw(node2rpos(i)+1, &str);
+ kputs(hap->stack[i].node->var, &str);
+ }
+ node->csq_list[icsq].type.vstr = str;
+ csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec);
+
+ for (i=ibeg; i<=iend; i++)
+ {
+ // csq are printed at one position only for combined variants, the rest is
+ // silent and references the first
+ if ( hap->stack[i].node->csq & ~CSQ_COMPOUND )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.vstr.l = 0;
+ kputs(str.s,&tmp_csq->type.vstr);
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.ref = hap->stack[ref_node].node->rec;
+ tmp_csq->type.vstr.l = 0;
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ }
+}
+
+void hap_finalize(args_t *args, hap_t *hap)
+{
+ tscript_t *tr = hap->tr;
+ if ( !tr->sref )
+ tscript_splice_ref(tr);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ int istack = 0;
+ hts_expand(hstack_t,1,hap->mstack,hap->stack);
+
+ hap->sseq.l = 0;
+ hap->tseq.l = 0;
+ hap->stack[0].node = tr->root;
+ hap->stack[0].ichild = -1;
+ hap->stack[0].slen = 0;
+ hap->stack[0].dlen = 0;
+
+ while ( istack>=0 )
+ {
+ hstack_t *stack = &hap->stack[istack];
+ hap_node_t *node = hap->stack[istack].node;
+ while ( ++hap->stack[istack].ichild < node->nchild )
+ {
+ if ( node->child[stack->ichild] ) break;
+ }
+ if ( stack->ichild == node->nchild ) { istack--; continue; }
+
+ node = node->child[stack->ichild];
+
+ istack++;
+ hts_expand(hstack_t,istack+1,hap->mstack,hap->stack);
+ stack = &hap->stack[istack-1];
+
+ hap->stack[istack].node = node;
+ hap->stack[istack].ichild = -1;
+
+ hap->sseq.l = stack->slen;
+ if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq);
+ hap->stack[istack].slen = hap->sseq.l;
+ hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen;
+
+ if ( !node->nend ) continue; // not a leaf node
+
+ // The spliced sequence has been built for the current haplotype and stored
+ // in hap->sseq. Now we break it and output as independent parts
+
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript
+ hap->upstream_stop = 0;
+
+ int i = 1, dlen = 0, ibeg, indel = 0;
+ while ( i<istack && hap->stack[i].node->type == HAP_SSS ) i++;
+ hap->sbeg = hap->stack[i].node->sbeg;
+
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = 0, ibeg = -1;
+ while ( ++i <= istack )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ // start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i<istack )
+ {
+ if ( dlen%3 ) // frameshift
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = node2sbeg(i);
+ int inext = node2sbeg(i+1);
+ if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+
+ int ioff = node2soff(ibeg);
+ int icur = node2sbeg(ibeg);
+ int rbeg = node2rbeg(ibeg);
+ int rend = node2rend(i);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[i].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(i) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ else
+ {
+ i = istack + 1, ibeg = -1;
+ while ( --i > 0 )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i>1 && hap->stack[i-1].node->type != HAP_SSS )
+ {
+ if ( dlen%3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = sseq.m - 1 - node2sbeg(i);
+ int inext = sseq.m - 1 - node2sbeg(i-1);
+ if ( icur/3 == inext/3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+ int ioff = node2soff(i);
+ int icur = node2sbeg(i);
+ int rbeg = node2rbeg(i);
+ int rend = node2rend(ibeg);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[ibeg].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(ibeg) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ }
+}
+
+static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
+{
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+}
+static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue;
+ assert( csq->type.vstr.l );
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+ }
+}
+
+static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list || ismpl<0 ) return;
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ vrec_t *vrec = csq->vrec;
+ int icsq = 2*csq->idx + ihap;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+}
+
+void hap_flush(args_t *args, uint32_t pos)
+{
+ int i,j;
+ tr_heap_t *heap = args->active_tr;
+
+ while ( heap->ndat && heap->dat[0]->end<=pos )
+ {
+ tscript_t *tr = heap->dat[0];
+ khp_delete(trhp, heap);
+
+ args->hap->tr = tr;
+ if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+ {
+ hap_finalize(args, args->hap);
+
+ if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
+ {
+ if ( args->phase==PHASE_DROP_GT )
+ hap_print_text(args, tr, -1,0, tr->hap[0]);
+ else
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+ }
+ }
+ }
+ else if ( args->phase!=PHASE_DROP_GT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+ }
+ }
+ }
+
+ // mark the transcript for deletion. Cannot delete it immediately because
+ // by-position VCF output will need them when flushed by vcf_buf_push
+ args->nrm_tr++;
+ hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+ args->rm_tr[args->nrm_tr-1] = tr;
+ }
+}
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+
+void vbuf_push(args_t *args, bcf1_t **rec_ptr)
+{
+ int i;
+
+ assert(rec_ptr);
+ bcf1_t *rec = *rec_ptr;
+
+ // check for duplicate records
+ i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
+ if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
+ {
+ // vcf record with a new pos
+ rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
+ i = rbuf_append(&args->vcf_rbuf);
+ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t));
+ args->vcf_buf[i]->n = 0;
+ }
+ vbuf_t *vbuf = args->vcf_buf[i];
+ vbuf->n++;
+ hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec);
+ if ( !vbuf->vrec[vbuf->n - 1] )
+ vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t));
+
+ vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
+ if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
+ {
+ if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ }
+ if ( !vrec->line ) vrec->line = bcf_init1();
+ SWAP(bcf1_t*, (*rec_ptr), vrec->line);
+
+ int ret;
+ khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret);
+ kh_val(args->pos2vbuf,k) = vbuf;
+}
+
+void vbuf_flush(args_t *args)
+{
+ if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone
+
+ int i,j;
+ while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 )
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ for (i=0; i<vbuf->n; i++)
+ {
+ vrec_t *vrec = vbuf->vrec[i];
+ if ( !args->out_fh ) // not a VCF output
+ {
+ vrec->nvcsq = 0;
+ continue;
+ }
+ if ( !vrec->nvcsq )
+ {
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ continue;
+ }
+
+ args->str.l = 0;
+ kput_vcsq(&vrec->vcsq[0], &args->str);
+ for (j=1; j<vrec->nvcsq; j++)
+ {
+ kputc_(',', &args->str);
+ kput_vcsq(&vrec->vcsq[j], &args->str);
+ }
+ bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
+ if ( args->hdr_nsmpl )
+ {
+ if ( vrec->nfmt < args->nfmt_bcsq )
+ for (j=1; j<args->hdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl));
+ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt);
+ }
+ vrec->nvcsq = 0;
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ }
+ if ( vbuf->n )
+ {
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos);
+ if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
+ }
+ vbuf->n = 0;
+ }
+
+ for (i=0; i<args->nrm_tr; i++)
+ {
+ tscript_t *tr = args->rm_tr[i];
+ if ( tr->root ) hap_destroy(tr->root);
+ tr->root = NULL;
+ free(tr->hap);
+ free(tr->ref);
+ free(tr->sref);
+ }
+ args->nrm_tr = 0;
+ args->ncsq_buf = 0;
+}
+
+void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+{
+ int i, len;
+ int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
+
+ tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ if ( !tr->ref )
+ error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
+
+ int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
+ if ( pad_beg + pad_end != 2*N_REF_PAD )
+ {
+ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD);
+ for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
+ memcpy(ref+i, tr->ref, len);
+ for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
+ free(tr->ref);
+ tr->ref = ref;
+ }
+}
+
+static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+{
+ char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0);
+ char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos);
+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) );
+ while ( *ref && *vcf )
+ {
+ if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) )
+ error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]);
+ ref++;
+ vcf++;
+ }
+}
+
+int test_cds_local(args_t *args, bcf1_t *rec)
+{
+ int i,j, ret = 0;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ // structures to fake the normal test_cds machinery
+ hap_node_t root, node;
+ root.type = HAP_ROOT;
+ kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
+
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+
+ if ( !tr->ref )
+ {
+ tscript_init_ref(args, tr, chr);
+ tscript_splice_ref(tr);
+ khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
+
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+
+ int csq_type = node.csq;
+
+ // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though
+ if ( node.type == HAP_SSS )
+ {
+ csq.type.type = csq_type;
+ csq_stage(args, &csq, rec);
+ }
+ else
+ {
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + node.dlen;
+ sseq.s = node.seq;
+ int alen = sseq.l = strlen(sseq.s);
+ int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
+
+ sseq.m = sref.m - 2*N_REF_PAD;
+ sseq.s = sref.s + N_REF_PAD + node.sbeg;
+ sseq.l = node.rlen;
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
+
+ // check for truncating stops
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j]=='*' ) break;
+ if ( j!=tref->l )
+ {
+ tref->l = j+1;
+ tref->s[j+1] = 0;
+ }
+ for (j=0; j<tseq->l; j++)
+ if ( tseq->s[j]=='*' ) break;
+ if ( j!=tseq->l )
+ {
+ tseq->l = j+1;
+ tseq->s[j+1] = 0;
+ }
+ if ( csq_type & CSQ_STOP_LOST )
+ {
+ if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ {
+ csq_type &= ~CSQ_STOP_LOST;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else if (tref->s[tref->l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( tseq->s[tseq->l-1] == '*' )
+ {
+ csq_type &= ~CSQ_STOP_GAINED;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq_type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
+ csq_type &= ~CSQ_START_LOST;
+ if ( node.dlen!=0 )
+ {
+ if ( node.dlen%3 )
+ csq_type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( node.dlen<0 )
+ csq_type |= CSQ_INFRAME_DELETION;
+ else
+ csq_type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j] != tseq->s[j] ) break;
+ if ( j==tref->l )
+ csq_type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( tref->s[j] == '*' )
+ csq_type |= CSQ_STOP_LOST;
+ else if ( tseq->s[j] == '*' )
+ csq_type |= CSQ_STOP_GAINED;
+ else
+ csq_type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( csq_type & CSQ_COMPOUND )
+ {
+ // create the aa variant string
+ kstring_t str = {0,0,0};
+ int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(tref->s, &str);
+ if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(tseq->s, &str);
+ }
+ kputc_('|', &str);
+ kputw(rec->pos+1, &str);
+ kputs(node.var, &str);
+ csq.type.vstr = str;
+ csq.type.type = csq_type & CSQ_COMPOUND;
+ csq_stage(args, &csq, rec);
+
+ // all this only to clean vstr when vrec is flushed
+ if ( !tr->root )
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->root->ncsq_list++;
+ hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
+ csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+ rm_csq->type.vstr = str;
+ }
+ if ( csq_type & ~CSQ_COMPOUND )
+ {
+ csq.type.type = csq_type & ~CSQ_COMPOUND;
+ csq.type.vstr.l = 0;
+ csq_stage(args, &csq, rec);
+ }
+ }
+ free(node.seq);
+ free(node.var);
+ }
+ }
+ return ret;
+}
+
+int test_cds(args_t *args, bcf1_t *rec)
+{
+ int i, ret = 0, hap_ret;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+ if ( !tr->root )
+ {
+ // initialize the transcript and its haplotype tree, fetch the reference sequence
+ tscript_init_ref(args, tr, chr);
+
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
+ tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
+ for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
+ tr->root->nend = tr->nhap;
+ tr->root->type = HAP_ROOT;
+
+ khp_insert(trhp, args->active_tr, &tr);
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ if ( args->phase==PHASE_DROP_GT )
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ }
+ else ret = 1; // prevent reporting as intron in test_tscript
+ free(child);
+ continue;
+ }
+ parent->nend--;
+ parent->nchild = 1;
+ parent->mchild = 1;
+ parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
+ parent->child[0] = child;
+ tr->hap[0] = child;
+ tr->hap[0]->nend = 1;
+ continue;
+ }
+
+ // apply the VCF variants and extend the haplotype tree
+ int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ ngts /= bcf_hdr_nsamples(args->hdr);
+ if ( ngts!=1 && ngts!=2 )
+ {
+ if ( !args->quiet )
+ fprintf(stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ continue;
+ }
+ for (ismpl=0; ismpl<args->smpl->n; ismpl++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
+ if ( gt[0]==bcf_gt_missing ) continue;
+
+ if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+ {
+ if ( args->phase==PHASE_MERGE )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ }
+ if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
+ {
+ if ( args->phase==PHASE_REQUIRE )
+ error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ if ( args->phase==PHASE_SKIP )
+ continue;
+ if ( args->phase==PHASE_NON_REF )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0];
+ }
+ }
+ }
+
+ for (ihap=0; ihap<ngts; ihap++)
+ {
+ if ( gt[ihap]==bcf_gt_missing || gt[ihap]==bcf_int32_vector_end ) continue;
+
+ i = 2*ismpl + ihap;
+
+ int ial = bcf_gt_allele(gt[ihap]);
+ if ( !ial ) continue;
+ assert( ial < rec->n_allele );
+ if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
+
+ hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+ if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
+ {
+ // this haplotype has been seen in another sample
+ tr->hap[i] = parent->child[ parent->cur_child[ial] ];
+ tr->hap[i]->nend++;
+ parent->nend--;
+ continue;
+ }
+
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ }
+ free(child);
+ continue;
+ }
+
+ if ( parent->cur_rec!=rec )
+ {
+ hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
+ for (j=0; j<rec->n_allele; j++) parent->cur_child[j] = -1;
+ parent->cur_rec = rec;
+ }
+
+ j = parent->nchild++;
+ hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
+ parent->cur_child[ial] = j;
+ parent->child[j] = child;
+ tr->hap[i] = child;
+ tr->hap[i]->nend++;
+ parent->nend--;
+ }
+ }
+ }
+ return ret;
+}
+
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+ // known issues: tab output leads to unsorted output. This is because
+ // coding haplotypes are printed in one go and buffering is not used
+ // with tab output. VCF output is OK though.
+ if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists
+
+ int i,j,ngt = 0;
+ if ( args->phase!=PHASE_DROP_GT )
+ {
+ ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr);
+ }
+ if ( ngt<=0 )
+ {
+ if ( args->output_type==FT_TAB_TEXT )
+ csq_print_text(args, csq, -1,0);
+ return;
+ }
+ assert( ngt<=2 );
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ csq_print_text(args, csq, args->smpl->idx[i],j+1);
+ }
+ }
+ return;
+ }
+
+ vrec_t *vrec = csq->vrec;
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+
+ int icsq = 2*csq->idx + j;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int ismpl = args->smpl->idx[i];
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+ }
+}
+int test_utr(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
+ tscript_t *tr = splice.tr = utr->tr;
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+int test_splice(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.check_acceptor = splice.check_donor = 1;
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
+ splice.tr = exon->tr;
+ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites
+
+ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
+ splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ splice_csq(args, &splice, exon->beg, exon->end);
+ if ( splice.csq ) ret = 1;
+ }
+ }
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return ret;
+}
+int test_tscript(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+
+void process(args_t *args, bcf1_t **rec_ptr)
+{
+ if ( !rec_ptr )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ return;
+ }
+
+ bcf1_t *rec = *rec_ptr;
+
+ int call_csq = 1;
+ if ( !rec->n_allele ) call_csq = 0; // no alternate allele
+ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
+ else if ( args->filter )
+ {
+ call_csq = filter_test(args->filter, rec, NULL);
+ if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1;
+ }
+ if ( !call_csq )
+ {
+ if ( !args->out_fh ) return; // not a VCF output
+ vbuf_push(args, rec_ptr);
+ vbuf_flush(args);
+ return;
+ }
+
+ if ( args->rid != rec->rid )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ }
+ args->rid = rec->rid;
+ vbuf_push(args, rec_ptr);
+
+ int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec);
+ hit += test_utr(args, rec);
+ hit += test_splice(args, rec);
+ if ( !hit ) test_tscript(args, rec);
+
+ hap_flush(args, rec->pos-1);
+ vbuf_flush(args);
+
+ return;
+}
+
+const char *usage(void)
+{
+ return
+ "\n"
+ "About: Haplotype-aware consequence caller.\n"
+ "Usage: bcftools csq [options] in.vcf\n"
+ "\n"
+ "Required options:\n"
+ " -f, --fasta-ref <file> reference file in fasta format\n"
+ " -g, --gff-annot <file> gff3 annotation file\n"
+ "\n"
+ "CSQ options:\n"
+ " -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
+ " -l, --local-csq localized predictions, consider only one VCF record at a time\n"
+ " -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
+ " -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
+ " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
+ " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
+ " r: require phased GTs, throw an error on unphased het GTs\n"
+ " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
+ " s: skip unphased GTs\n"
+ "Options:\n"
+ " -e, --exclude <expr> exclude sites for which the expression is true\n"
+ " -i, --include <expr> select sites for which the expression is true\n"
+ " -o, --output <file> write output to a file [standard output]\n"
+ " -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
+ " -q, --quiet suppress warning messages. Can be given two times for even less messages\n"
+ " -r, --regions <region> restrict to comma-separated list of regions\n"
+ " -R, --regions-file <file> restrict to regions listed in a file\n"
+ " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file <file> samples to include\n"
+ " -t, --targets <region> similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n"
+ "\n"
+ "Example:\n"
+ " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
+ "\n"
+ " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
+ " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
+ " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
+ "\n";
+}
+
+int main_csq(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_type = FT_VCF;
+ args->bcsq_tag = "BCSQ";
+ args->ncsq_max = 2*16;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"ncsq",1,0,'n'},
+ {"custom-tag",1,0,'c'},
+ {"local-csq",0,0,'l'},
+ {"gff-annot",1,0,'g'},
+ {"fasta-ref",1,0,'f'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,NULL,'O'},
+ {"phase",1,0,'p'},
+ {"quiet",0,0,'q'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {0,0,0,0}
+ };
+ int c, targets_is_file = 0, regions_is_file = 0;
+ char *targets_list = NULL, *regions_list = NULL;
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'l': args->local_csq = 1; break;
+ case 'c': args->bcsq_tag = optarg; break;
+ case 'q': args->quiet++; break;
+ case 'p':
+ switch (optarg[0])
+ {
+ case 'a': args->phase = PHASE_AS_IS; break;
+ case 'm': args->phase = PHASE_MERGE; break;
+ case 'r': args->phase = PHASE_REQUIRE; break;
+ case 'R': args->phase = PHASE_NON_REF; break;
+ case 's': args->phase = PHASE_SKIP; break;
+ default: error("The -p code \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'f': args->fa_fname = optarg; break;
+ case 'g': args->gff_fname = optarg; break;
+ case 'n':
+ args->ncsq_max = 2 * atoi(optarg);
+ if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 't': args->output_type = FT_TAB_TEXT; break;
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': regions_list = optarg; break;
+ case 'R': regions_list = optarg; regions_is_file = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 't': targets_list = optarg; break;
+ case 'T': targets_list = optarg; targets_is_file = 1; break;
+ case 'h':
+ case '?': error("%s",usage());
+ default: error("The option not recognised: %s\n\n", optarg); break;
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else error("%s", usage());
+ }
+ else fname = argv[optind];
+ if ( argc - optind>1 ) error("%s", usage());
+ if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
+ if ( !args->gff_fname ) error("Missing the --gff option\n");
+ args->sr = bcf_sr_init();
+ if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", targets_list);
+ if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", regions_list);
+ if ( !bcf_sr_add_reader(args->sr, fname) )
+ error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum));
+ args->hdr = bcf_sr_get_header(args->sr,0);
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->sr) )
+ {
+ process(args, &args->sr->readers[0].buffer[0]);
+ }
+ process(args,NULL);
+
+ destroy_data(args);
+ bcf_sr_destroy(args->sr);
+ free(args);
+
+ return 0;
+}
+
--- /dev/null
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ Things that would be nice to have
+ - for stop-lost events (also in frameshifts) report the number of truncated aa's
+ - memory could be greatly reduced by indexing gff (but it is quite compact already)
+ - deletions that go beyond transcript boundaries are not checked at sequence level
+ - alloc tscript->ref in hap_finalize, introduce fa_off_beg:16,fa_off_end:16
+ - see test/csq/ENST00000573314/insertion-overlap.vcf #1476288882
+
+ Read about transcript types here
+ http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
+ http://www.ensembl.org/info/genome/variation/predicted_data.html
+ http://www.gencodegenes.org/gencode_biotypes.html
+
+ List of supported biotypes
+ antisense
+ IG_C_gene
+ IG_D_gene
+ IG_J_gene
+ IG_LV_gene
+ IG_V_gene
+ lincRNA
+ macro_lncRNA
+ miRNA
+ misc_RNA
+ Mt_rRNA
+ Mt_tRNA
+ polymorphic_pseudogene
+ processed_transcript
+ protein_coding
+ ribozyme
+ rRNA
+ sRNA
+ scRNA
+ scaRNA
+ sense_intronic
+ sense_overlapping
+ snRNA
+ snoRNA
+ TR_C_gene
+ TR_D_gene
+ TR_J_gene
+ TR_V_gene
+
+ The gff parsing logic
+ We collect features such by combining gff lines A,B,C as follows:
+ A .. gene line with a supported biotype
+ A.ID=~/^gene:/
+
+ B .. transcript line referencing A
+ B.ID=~/^transcript:/ && B.Parent=~/^gene:A.ID/
+
+ C .. corresponding CDS, exon, and UTR lines:
+ C[3] in {"CDS","exon","three_prime_UTR","five_prime_UTR"} && C.Parent=~/^transcript:B.ID/
+
+ For coding biotypes ("protein_coding" or "polymorphic_pseudogene") the
+ complete chain link C -> B -> A is required. For the rest, link B -> A suffices.
+
+
+ The supported consequence types, sorted by impact:
+ splice_acceptor_variant .. end region of an intron changed (2bp at the 3' end of an intron)
+ splice_donor_variant .. start region of an intron changed (2bp at the 5' end of an intron)
+ stop_gained .. DNA sequence variant resulting in a stop codon
+ frameshift_variant .. number of inserted/deleted bases not a multiple of three, disrupted translational frame
+ stop_lost .. elongated transcript, stop codon changed
+ start_lost .. the first codon changed
+ inframe_altering .. combination of indels leading to unchanged reading frame and length
+ inframe_insertion .. inserted coding sequence, unchanged reading frame
+ inframe_deletion .. deleted coding sequence, unchanged reading frame
+ missense_variant .. amino acid (aa) change, unchanged length
+ splice_region_variant .. change within 1-3 bases of the exon or 3-8 bases of the intron
+ synonymous_variant .. DNA sequence variant resulting in no amino acid change
+ stop_retained_variant .. different stop codon
+ non_coding_variant .. variant in non-coding sequence, such as RNA gene
+ 5_prime_UTR_variant
+ 3_prime_UTR_variant
+ intron_variant .. reported only if none of the above
+ intergenic_variant .. reported only if none of the above
+
+
+ The annotation algorithm.
+ The algorithm checks if the variant falls in a region of a supported type. The
+ search is performed in the following order, until a match is found:
+ 1. idx_cds(gf_cds_t) - lookup CDS by position, create haplotypes, call consequences
+ 2. idx_utr(gf_utr_t) - check UTR hits
+ 3. idx_exon(gf_exon_t) - check for splice variants
+ 4. idx_tscript(tscript_t) - check for intronic variants, RNAs, etc.
+
+ These regidx indexes are created by parsing a gff3 file as follows:
+ 1. create the array "ftr" of all UTR, CDS, exons. This will be
+ processed later and pruned based on transcript types we want to keep.
+ In the same go, create the hash "id2tr" of transcripts to keep
+ (based on biotype) which maps from transcript_id to a transcript. At
+ the same time also build the hash "gid2gene" which maps from gene_id to
+ gf_gene_t pointer.
+
+ 2. build "idx_cds", "idx_tscript", "idx_utr" and "idx_exon" indexes.
+ Use only features from "ftr" which are present in "id2tr".
+
+ 3. clean data that won't be needed anymore: ftr, id2tr, gid2gene.
+
+ Data structures.
+ idx_cds, idx_utr, idx_exon, idx_tscript:
+ as described above, regidx structures for fast lookup of exons/transcripts
+ overlapping a region, the payload is a pointer to tscript.cds
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <math.h>
+#include <htslib/hts.h>
+#include <htslib/vcf.h>
+#include <htslib/synced_bcf_reader.h>
+#include <htslib/khash.h>
+#include <htslib/khash_str2int.h>
+#include <htslib/kseq.h>
+#include <htslib/faidx.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <ctype.h>
+#include "bcftools.h"
+#include "filter.h"
+#include "regidx.h"
+#include "kheap.h"
+#include "smpl_ilist.h"
+#include "rbuf.h"
+
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+// Logic of the filters: include or exclude sites which match the filters?
+#define FLT_INCLUDE 1
+#define FLT_EXCLUDE 2
+
+// Definition of splice_region, splice_acceptor and splice_donor
+#define N_SPLICE_DONOR 2
+#define N_SPLICE_REGION_EXON 3
+#define N_SPLICE_REGION_INTRON 8
+
+// Ensembl ID format, e.g.
+// ENST00000423372 for human .. ENST%011d
+// ENSMUST00000120394 for mouse .. ENSMUST%011d
+char ENSID_BUF[32], *ENSID_FMT = NULL;
+static inline char *ENSID(uint32_t id)
+{
+ sprintf(ENSID_BUF,ENSID_FMT,id);
+ return ENSID_BUF;
+}
+
+
+#define N_REF_PAD 10 // number of bases to avoid boundary effects
+
+#define STRAND_REV 0
+#define STRAND_FWD 1
+
+#define TRIM_NONE 0
+#define TRIM_5PRIME 1
+#define TRIM_3PRIME 2
+
+// How to treat phased/unphased genotypes
+#define PHASE_REQUIRE 0 // --phase r
+#define PHASE_MERGE 1 // --phase m
+#define PHASE_AS_IS 2 // --phase a
+#define PHASE_SKIP 3 // --phase s
+#define PHASE_NON_REF 4 // --phase R
+#define PHASE_DROP_GT 5 // --samples -
+
+// Node types in the haplotype tree
+#define HAP_CDS 0
+#define HAP_ROOT 1
+#define HAP_SSS 2 // start/stop/splice
+
+#define CSQ_PRINTED_UPSTREAM (1<<0)
+#define CSQ_SYNONYMOUS_VARIANT (1<<1)
+#define CSQ_MISSENSE_VARIANT (1<<2)
+#define CSQ_STOP_LOST (1<<3)
+#define CSQ_STOP_GAINED (1<<4)
+#define CSQ_INFRAME_DELETION (1<<5)
+#define CSQ_INFRAME_INSERTION (1<<6)
+#define CSQ_FRAMESHIFT_VARIANT (1<<7)
+#define CSQ_SPLICE_ACCEPTOR (1<<8)
+#define CSQ_SPLICE_DONOR (1<<9)
+#define CSQ_START_LOST (1<<10)
+#define CSQ_SPLICE_REGION (1<<11)
+#define CSQ_STOP_RETAINED (1<<12)
+#define CSQ_UTR5 (1<<13)
+#define CSQ_UTR3 (1<<14)
+#define CSQ_NON_CODING (1<<15)
+#define CSQ_INTRON (1<<16)
+//#define CSQ_INTERGENIC (1<<17)
+#define CSQ_INFRAME_ALTERING (1<<18)
+#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string
+#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
+#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence
+
+// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
+#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
+ CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
+ CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
+ CSQ_UPSTREAM_STOP)
+#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST)
+
+#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
+#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
+#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
+
+// see kput_vcsq()
+const char *csq_strings[] =
+{
+ NULL,
+ "synonymous",
+ "missense",
+ "stop_lost",
+ "stop_gained",
+ "inframe_deletion",
+ "inframe_insertion",
+ "frameshift",
+ "splice_acceptor",
+ "splice_donor",
+ "start_lost",
+ "splice_region",
+ "stop_retained",
+ "5_prime_utr",
+ "3_prime_utr",
+ "non_coding",
+ "intron",
+ "intergenic",
+ "inframe_altering",
+ NULL,
+ NULL,
+ "coding_sequence"
+};
+
+
+// GFF line types
+#define GFF_TSCRIPT_LINE 1
+#define GFF_GENE_LINE 2
+
+
+/*
+ Genomic features, for fast lookup by position to overlapping features
+*/
+#define GF_coding_bit 6
+#define GF_is_coding(x) ((x) & (1<<GF_coding_bit))
+#define GF_MT_rRNA 1 // non-coding: 1, 2, ...
+#define GF_MT_tRNA 2
+#define GF_lincRNA 3
+#define GF_miRNA 4
+#define GF_MISC_RNA 5
+#define GF_rRNA 6
+#define GF_snRNA 7
+#define GF_snoRNA 8
+#define GF_PROCESSED_TRANSCRIPT 9
+#define GF_ANTISENSE 10
+#define GF_macro_lncRNA 11
+#define GF_ribozyme 12
+#define GF_sRNA 13
+#define GF_scRNA 14
+#define GF_scaRNA 15
+#define GF_SENSE_INTRONIC 16
+#define GF_SENSE_OVERLAPPING 17
+#define GF_PSEUDOGENE 18
+#define GF_PROCESSED_PSEUDOGENE 19
+#define GF_ARTIFACT 20
+#define GF_IG_PSEUDOGENE 21
+#define GF_IG_C_PSEUDOGENE 22
+#define GF_IG_J_PSEUDOGENE 23
+#define GF_IG_V_PSEUDOGENE 24
+#define GF_TR_V_PSEUDOGENE 25
+#define GF_TR_J_PSEUDOGENE 26
+#define GF_MT_tRNA_PSEUDOGENE 27
+#define GF_misc_RNA_PSEUDOGENE 28
+#define GF_miRNA_PSEUDOGENE 29
+#define GF_RIBOZYME 30
+#define GF_RETAINED_INTRON 31
+#define GF_RETROTRANSPOSED 32
+#define GF_tRNA_PSEUDOGENE 33
+#define GF_TRANSCRIBED_PROCESSED_PSEUDOGENE 34
+#define GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE 35
+#define GF_TRANSCRIBED_UNITARY_PSEUDOGENE 36
+#define GF_TRANSLATED_UNPROCESSED_PSEUDOGENE 37
+#define GF_TRANSLATED_PROCESSED_PSEUDOGENE 38
+#define GF_KNOWN_NCRNA 39
+#define GF_UNITARY_PSEUDOGENE 40
+#define GF_UNPROCESSED_PSEUDOGENE 41
+#define GF_LRG_GENE 42
+#define GF_3PRIME_OVERLAPPING_ncRNA 43
+#define GF_DISRUPTED_DOMAIN 44
+#define GF_vaultRNA 45
+#define GF_BIDIRECTIONAL_PROMOTER_lncRNA 46
+#define GF_AMBIGUOUS_ORF 47
+#define GF_PROTEIN_CODING (1|(1<<GF_coding_bit)) // coding: 65, 66, ...
+#define GF_POLYMORPHIC_PSEUDOGENE (2|(1<<GF_coding_bit))
+#define GF_IG_C (3|(1<<GF_coding_bit))
+#define GF_IG_D (4|(1<<GF_coding_bit))
+#define GF_IG_J (5|(1<<GF_coding_bit))
+#define GF_IG_LV (6|(1<<GF_coding_bit))
+#define GF_IG_V (7|(1<<GF_coding_bit))
+#define GF_TR_C (8|(1<<GF_coding_bit))
+#define GF_TR_D (9|(1<<GF_coding_bit))
+#define GF_TR_J (10|(1<<GF_coding_bit))
+#define GF_TR_V (11|(1<<GF_coding_bit))
+#define GF_NMD (12|(1<<GF_coding_bit))
+#define GF_NON_STOP_DECAY (13|(1<<GF_coding_bit))
+#define GF_CDS ((1<<(GF_coding_bit+1))+1) // special types: 129, 130, ...
+#define GF_EXON ((1<<(GF_coding_bit+1))+2)
+#define GF_UTR3 ((1<<(GF_coding_bit+1))+3)
+#define GF_UTR5 ((1<<(GF_coding_bit+1))+4)
+// GF_MAX = (1<<30)-1, see hap_node_t
+
+typedef struct _tscript_t tscript_t;
+typedef struct
+{
+ tscript_t *tr; // transcript
+ uint32_t beg; // the start coordinate of the CDS (on the reference strand, 0-based)
+ uint32_t pos; // 0-based index of the first exon base within the transcript (only to
+ // update hap_node_t.sbeg in hap_init, could be calculated on the fly)
+ uint32_t len; // exon length
+ uint32_t icds:30, // exon index within the transcript
+ phase:2; // offset of the CDS
+}
+gf_cds_t;
+typedef struct
+{
+ char *name; // human readable name, e.g. ORF45
+ uint8_t iseq;
+}
+gf_gene_t;
+typedef struct
+{
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_exon_t;
+typedef enum { prime3, prime5 } utr_t;
+typedef struct
+{
+ utr_t which;
+ uint32_t beg,end;
+ tscript_t *tr;
+}
+gf_utr_t;
+
+
+/*
+ Structures related to VCF output:
+
+ vcsq_t
+ information required to assemble consequence lines such as "inframe_deletion|XYZ|ENST01|+|5TY>5I|121ACG>A+124TA>T"
+
+ vcrec_t
+ single VCF record and csq tied to this record. (Haplotype can have multiple
+ consequences in several VCF records. Each record can have multiple consequences
+ from multiple haplotypes.)
+
+ csq_t
+ a top-level consequence tied to a haplotype
+
+ vbuf_t
+ pos2vbuf
+ VCF records with the same position clustered together for a fast lookup via pos2vbuf
+*/
+typedef struct _vbuf_t vbuf_t;
+typedef struct _vcsq_t vcsq_t;
+struct _vcsq_t
+{
+ uint32_t strand:1,
+ type:31; // one of CSQ_* types
+ uint32_t trid;
+ uint32_t biotype; // one of GF_* types
+ char *gene; // gene name
+ bcf1_t *ref; // if type&CSQ_PRINTED_UPSTREAM, ref consequence "@1234"
+ kstring_t vstr; // variant string, eg 5TY>5I|121ACG>A+124TA>T
+};
+typedef struct
+{
+ bcf1_t *line;
+ uint32_t *smpl; // bitmask of sample consequences with first/second haplotype interleaved
+ uint32_t nfmt:4, nvcsq:28, mvcsq;
+ vcsq_t *vcsq; // there can be multiple consequences for a single VCF record
+}
+vrec_t;
+typedef struct
+{
+ uint32_t pos;
+ vrec_t *vrec; // vcf line that this csq is tied to; needed when printing haplotypes (hap_stage_vcf)
+ int idx; // 0-based index of the csq at the VCF line, for FMT/BCSQ
+ vcsq_t type;
+}
+csq_t;
+struct _vbuf_t
+{
+ vrec_t **vrec; // buffer of VCF lines with the same position
+ int n, m;
+};
+KHASH_MAP_INIT_INT(pos2vbuf, vbuf_t*)
+
+
+/*
+ Structures related to haplotype-aware consequences in coding regions
+
+ hap_node_t
+ node of a haplotype tree. Each transcript has one tree
+
+ tscript_t
+ despite its general name, it is intended for coding transcripts only
+
+ hap_t
+ hstack_t
+ for traversal of the haplotype tree and braking combined
+ consequences into independent parts
+*/
+typedef struct _hap_node_t hap_node_t;
+struct _hap_node_t
+{
+ char *seq; // cds segment [parent_node,this_node)
+ char *var; // variant "ref>alt"
+ uint32_t type:2, // HAP_ROOT or HAP_CDS
+ csq:30; // this node's consequence
+ int dlen; // alt minus ref length: <0 del, >0 ins, 0 substitution
+ uint32_t rbeg; // variant's VCF position (0-based, inclusive)
+ int32_t rlen; // variant's rlen; alen=rlen+dlen; fake for non CDS types
+ uint32_t sbeg; // variant's position on the spliced reference transcript (0-based, inclusive, N_REF_PAD not included)
+ uint32_t icds; // which exon does this node's variant overlaps
+ hap_node_t **child, *prev; // children haplotypes and previous coding node
+ int nchild, mchild;
+ bcf1_t *cur_rec, *rec; // current VCF record and node's VCF record
+ uint32_t nend; // number of haplotypes ending in this node
+ int *cur_child, mcur_child; // mapping from the allele to the currently active child
+ csq_t *csq_list; // list of haplotype's consequences, broken by position
+ int ncsq_list, mcsq_list;
+};
+struct _tscript_t
+{
+ uint32_t id; // transcript id
+ uint32_t beg,end; // transcript's beg and end coordinate (ref strand, 0-based, inclusive)
+ uint32_t strand:1, // STRAND_REV or STRAND_FWD
+ ncds:31, // number of exons
+ mcds;
+ gf_cds_t **cds; // ordered list of exons
+ char *ref; // reference sequence, padded with N_REF_PAD bases on both ends
+ char *sref; // spliced reference sequence, padded with N_REF_PAD bases on both ends
+ hap_node_t *root; // root of the haplotype tree
+ hap_node_t **hap; // pointer to haplotype leaves, two for each sample
+ int nhap, nsref; // number of haplotypes and length of sref, including 2*N_REF_PAD
+ uint32_t trim:2, // complete, 5' or 3' trimmed, see TRIM_* types
+ type:30; // one of GF_* types
+ gf_gene_t *gene;
+};
+static inline int cmp_tscript(tscript_t **a, tscript_t **b)
+{
+ return ( (*a)->end < (*b)->end ) ? 1 : 0;
+}
+KHEAP_INIT(trhp, tscript_t*, cmp_tscript)
+typedef khp_trhp_t tr_heap_t;
+typedef struct
+{
+ hap_node_t *node; // current node
+ int ichild; // current child in the active node
+ int dlen; // total dlen, from the root to the active node
+ size_t slen; // total sequence length, from the root to the active node
+}
+hstack_t;
+typedef struct
+{
+ int mstack;
+ hstack_t *stack;
+ tscript_t *tr; // tr->ref: spliced transcript on ref strand
+ kstring_t sseq; // spliced haplotype sequence on ref strand
+ kstring_t tseq; // the variable part of translated haplotype transcript, coding strand
+ kstring_t tref; // the variable part of translated reference transcript, coding strand
+ uint32_t sbeg; // stack's sbeg, for cases first node's type is HAP_SSS
+ int upstream_stop;
+}
+hap_t;
+
+
+/*
+ Helper structures, only for initialization
+
+ ftr_t
+ temporary list of all exons, CDS, UTRs
+*/
+KHASH_MAP_INIT_INT(int2tscript, tscript_t*)
+KHASH_MAP_INIT_INT(int2int, int)
+KHASH_MAP_INIT_INT(int2gene, gf_gene_t*)
+typedef struct
+{
+ int type; // GF_CDS, GF_EXON, GF_5UTR, GF_3UTR
+ uint32_t beg;
+ uint32_t end;
+ uint32_t trid;
+ uint32_t strand:1; // STRAND_REV,STRAND_FWD
+ uint32_t phase:2; // 0, 1 or 2
+ uint32_t iseq:29;
+}
+ftr_t;
+typedef struct
+{
+ // all exons, CDS, UTRs
+ ftr_t *ftr;
+ int nftr, mftr;
+
+ // mapping from transcript ensembl id to gene id
+ kh_int2gene_t *gid2gene;
+
+ // mapping from transcript id to tscript, for quick CDS anchoring
+ kh_int2tscript_t *id2tr;
+
+ // sequences
+ void *seq2int;
+ char **seq;
+ int nseq, mseq;
+
+ // ignored biotypes
+ void *ignored_biotypes;
+}
+aux_t;
+
+typedef struct _args_t
+{
+ // the main regidx lookups, from chr:beg-end to overlapping features and
+ // index iterator
+ regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
+ regitr_t *itr;
+
+ // temporary structures, deleted after initializtion
+ aux_t init;
+
+ // text tab-delimited output (out) or vcf/bcf output (out_fh)
+ FILE *out;
+ htsFile *out_fh;
+
+ // vcf
+ bcf_srs_t *sr;
+ bcf_hdr_t *hdr;
+ int hdr_nsmpl; // actual number of samples in the vcf, for bcf_update_format_values()
+
+ // include or exclude sites which match the filters
+ filter_t *filter;
+ char *filter_str;
+ int filter_logic; // FLT_INCLUDE or FLT_EXCLUDE
+
+ // samples to process
+ int sample_is_file;
+ char *sample_list;
+ smpl_ilist_t *smpl;
+
+ char *outdir, **argv, *fa_fname, *gff_fname, *output_fname;
+ char *bcsq_tag;
+ int argc, output_type;
+ int phase, quiet, local_csq;
+ int ncsq_max, nfmt_bcsq; // maximum number of csq per site that can be accessed from FORMAT/BCSQ
+ int ncsq_small_warned;
+
+ int rid; // current chromosome
+ tr_heap_t *active_tr; // heap of active transcripts for quick flushing
+ hap_t *hap; // transcript haplotype recursion
+ vbuf_t **vcf_buf; // buffered VCF lines to annotate with CSQ and flush
+ rbuf_t vcf_rbuf; // round buffer indexes to vcf_buf
+ kh_pos2vbuf_t *pos2vbuf; // fast lookup of buffered lines by position
+ tscript_t **rm_tr; // buffer of transcripts to clean
+ int nrm_tr, mrm_tr;
+ csq_t *csq_buf; // pool of csq not managed by hap_node_t, i.e. non-CDS csqs
+ int ncsq_buf, mcsq_buf;
+
+ faidx_t *fai;
+ kstring_t str, str2;
+ int32_t *gt_arr, mgt_arr;
+}
+args_t;
+
+// AAA, AAC, ...
+const char *gencode = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF";
+const uint8_t nt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,0,4,1, 4,4,4,2, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 3
+};
+const uint8_t cnt4[] =
+{
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0,4,4,4, 4,4,4,4, 4,4,4,4,
+ 4,3,4,2, 4,4,4,1, 4,4,4,4, 4,4,4,4,
+ 4,4,4,4, 0
+};
+#define dna2aa(x) gencode[ nt4[(uint8_t)(x)[0]]<<4 | nt4[(uint8_t)(x)[1]]<<2 | nt4[(uint8_t)(x)[2]] ]
+#define cdna2aa(x) gencode[ cnt4[(uint8_t)(x)[2]]<<4 | cnt4[(uint8_t)(x)[1]]<<2 | cnt4[(uint8_t)(x)[0]] ]
+
+static const char *gf_strings_noncoding[] =
+{
+ "MT_rRNA", "MT_tRNA", "lincRNA", "miRNA", "misc_RNA", "rRNA", "snRNA", "snoRNA", "processed_transcript",
+ "antisense", "macro_lncRNA", "ribozyme", "sRNA", "scRNA", "scaRNA", "sense_intronic", "sense_overlapping",
+ "pseudogene", "processed_pseudogene", "artifact", "IG_pseudogene", "IG_C_pseudogene", "IG_J_pseudogene",
+ "IG_V_pseudogene", "TR_V_pseudogene", "TR_J_pseudogene", "MT_tRNA_pseudogene", "misc_RNA_pseudogene",
+ "miRNA_pseudogene", "ribozyme", "retained_intron", "retrotransposed", "Trna_pseudogene", "transcribed_processed_pseudogene",
+ "transcribed_unprocessed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene",
+ "translated_processed_pseudogene", "known_ncRNA", "unitary_pseudogene", "unprocessed_pseudogene",
+ "LRG_gene", "3_prime_overlapping_ncRNA", "disrupted_domain", "vaultRNA", "bidirectional_promoter_lncRNA", "ambiguous_orf"
+};
+static const char *gf_strings_coding[] = { "protein_coding", "polymorphic_pseudogene", "IG_C", "IG_D", "IG_J", "IG_LV", "IG_V", "TR_C", "TR_D", "TR_J", "TR_V", "NMD", "non_stop_decay"};
+static const char *gf_strings_special[] = { "CDS", "exon", "3_prime_UTR", "5_prime_UTR" };
+
+const char *gf_type2gff_string(int type)
+{
+ if ( !GF_is_coding(type) )
+ {
+ if ( type < (1<<GF_coding_bit) ) return gf_strings_noncoding[type-1];
+ type &= (1<<(GF_coding_bit+1)) - 1;
+ return gf_strings_special[type - 1];
+ }
+ type &= (1<<GF_coding_bit) - 1;
+ return gf_strings_coding[type - 1];
+}
+
+/*
+ gff parsing functions
+*/
+static inline int feature_set_seq(args_t *args, char *chr_beg, char *chr_end)
+{
+ aux_t *aux = &args->init;
+ char c = chr_end[1];
+ chr_end[1] = 0;
+ int iseq;
+ if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+ {
+ hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
+ aux->seq[aux->nseq] = strdup(chr_beg);
+ iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+ aux->nseq++;
+ assert( aux->nseq < 256 ); // see gf_gene_t.iseq
+ }
+ chr_end[1] = c;
+ return iseq;
+}
+static inline char *gff_skip(const char *line, char *ss)
+{
+ while ( *ss && *ss!='\t' ) ss++;
+ if ( !*ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return ss+1;
+}
+static inline void gff_parse_chr(const char *line, char **chr_beg, char **chr_end)
+{
+ char *se = (char*) line;
+ while ( *se && *se!='\t' ) se++;
+ if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ *chr_beg = (char*) line;
+ *chr_end = se-1;
+}
+static inline char *gff_parse_beg_end(const char *line, char *ss, uint32_t *beg, uint32_t *end)
+{
+ char *se = ss;
+ *beg = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line:\n\t%s\n\t%s\n",__FILE__,__LINE__,__FUNCTION__,line,ss);
+ ss = se+1;
+ *end = strtol(ss, &se, 10) - 1;
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ return se+1;
+}
+static inline uint32_t gff_parse_id(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ while ( *ss && !isdigit(*ss) ) ss++;
+ if ( !ss ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ char *se;
+ uint32_t id = strtol(ss, &se, 10);
+ if ( ss==se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__, line);
+ if ( *se && *se!=';' && *se!='\t' ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
+ assert( id <= 0xffffff ); // see gf_gene_t.id. Ensembl IDs are never that big in practice
+ return id;
+}
+static void gff_parse_ensid_fmt(const char *line, const char *needle, char *ss)
+{
+ ss = strstr(ss,needle);
+ if ( !ss ) error("[%s:%d %s] Could not parse the line, \"%s\" not present: %s\n",__FILE__,__LINE__,__FUNCTION__,needle,line);
+ ss += strlen(needle);
+ char *se = ss;
+ while ( *se && !isdigit(*se) ) se++;
+ kstring_t str = {0,0,0};
+ kputsn(ss,se-ss,&str);
+ ss = se;
+ while ( *se && isdigit(*se) ) se++;
+ ksprintf(&str,"%%0%dd",(int)(se-ss));
+ ENSID_FMT = str.s;
+}
+static inline int gff_parse_type(char *line)
+{
+ line = strstr(line,"ID=");
+ if ( !line ) return -1;
+ line += 3;
+ if ( !strncmp(line,"transcript:",11) ) return GFF_TSCRIPT_LINE;
+ else if ( !strncmp(line,"gene:",5) ) return GFF_GENE_LINE;
+ return -1;
+}
+static inline int gff_parse_biotype(char *_line)
+{
+ char *line = strstr(_line,"biotype=");
+ if ( !line ) return -1;
+
+ line += 8;
+ switch (*line)
+ {
+ case 'p':
+ if ( !strncmp(line,"protein_coding",14) ) return GF_PROTEIN_CODING;
+ else if ( !strncmp(line,"pseudogene",10) ) return GF_PSEUDOGENE;
+ else if ( !strncmp(line,"processed_transcript",20) ) return GF_PROCESSED_TRANSCRIPT;
+ else if ( !strncmp(line,"processed_pseudogene",20) ) return GF_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"polymorphic_pseudogene",22) ) return GF_POLYMORPHIC_PSEUDOGENE;
+ break;
+ case 'a':
+ if ( !strncmp(line,"artifact",8) ) return GF_ARTIFACT;
+ else if ( !strncmp(line,"antisense",9) ) return GF_ANTISENSE;
+ else if ( !strncmp(line,"ambiguous_orf",13) ) return GF_AMBIGUOUS_ORF;
+ break;
+ case 'I':
+ if ( !strncmp(line,"IG_C_gene",9) ) return GF_IG_C;
+ else if ( !strncmp(line,"IG_D_gene",9) ) return GF_IG_D;
+ else if ( !strncmp(line,"IG_J_gene",9) ) return GF_IG_J;
+ else if ( !strncmp(line,"IG_LV_gene",10) ) return GF_IG_LV;
+ else if ( !strncmp(line,"IG_V_gene",9) ) return GF_IG_V;
+ else if ( !strncmp(line,"IG_pseudogene",13) ) return GF_IG_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_C_pseudogene",15) ) return GF_IG_C_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_J_pseudogene",15) ) return GF_IG_J_PSEUDOGENE;
+ else if ( !strncmp(line,"IG_V_pseudogene",15) ) return GF_IG_V_PSEUDOGENE;
+ break;
+ case 'T':
+ if ( !strncmp(line,"TR_C_gene",9) ) return GF_TR_C;
+ else if ( !strncmp(line,"TR_D_gene",9) ) return GF_TR_D;
+ else if ( !strncmp(line,"TR_J_gene",9) ) return GF_TR_J;
+ else if ( !strncmp(line,"TR_V_gene",9) ) return GF_TR_V;
+ else if ( !strncmp(line,"TR_V_pseudogene",15) ) return GF_TR_V_PSEUDOGENE;
+ else if ( !strncmp(line,"TR_J_pseudogene",15) ) return GF_TR_J_PSEUDOGENE;
+ break;
+ case 'M':
+ if ( !strncmp(line,"Mt_tRNA_pseudogene",18) ) return GF_MT_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"Mt_tRNA",7) ) return GF_MT_tRNA;
+ else if ( !strncmp(line,"Mt_rRNA",7) ) return GF_MT_tRNA;
+ break;
+ case 'l':
+ if ( !strncmp(line,"lincRNA",7) ) return GF_lincRNA;
+ break;
+ case 'm':
+ if ( !strncmp(line,"macro_lncRNA",12) ) return GF_macro_lncRNA;
+ else if ( !strncmp(line,"misc_RNA_pseudogene",19) ) return GF_misc_RNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA_pseudogene",16) ) return GF_miRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"miRNA",5) ) return GF_miRNA;
+ else if ( !strncmp(line,"misc_RNA",8) ) return GF_MISC_RNA;
+ break;
+ case 'r':
+ if ( !strncmp(line,"rRNA",4) ) return GF_rRNA;
+ else if ( !strncmp(line,"ribozyme",8) ) return GF_RIBOZYME;
+ else if ( !strncmp(line,"retained_intron",15) ) return GF_RETAINED_INTRON;
+ else if ( !strncmp(line,"retrotransposed",15) ) return GF_RETROTRANSPOSED;
+ break;
+ case 's':
+ if ( !strncmp(line,"snRNA",5) ) return GF_snRNA;
+ else if ( !strncmp(line,"sRNA",4) ) return GF_sRNA;
+ else if ( !strncmp(line,"scRNA",5) ) return GF_scRNA;
+ else if ( !strncmp(line,"scaRNA",6) ) return GF_scaRNA;
+ else if ( !strncmp(line,"snoRNA",6) ) return GF_snoRNA;
+ else if ( !strncmp(line,"sense_intronic",14) ) return GF_SENSE_INTRONIC;
+ else if ( !strncmp(line,"sense_overlapping",17) ) return GF_SENSE_OVERLAPPING;
+ break;
+ case 't':
+ if ( !strncmp(line,"tRNA_pseudogene",15) ) return GF_tRNA_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_processed_pseudogene",32) ) return GF_TRANSCRIBED_PROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unprocessed_pseudogene",34) ) return GF_TRANSCRIBED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"transcribed_unitary_pseudogene",30) ) return GF_TRANSCRIBED_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_unprocessed_pseudogene",33) ) return GF_TRANSLATED_UNPROCESSED_PSEUDOGENE;
+ else if ( !strncmp(line,"translated_processed_pseudogene",31) ) return GF_TRANSLATED_PROCESSED_PSEUDOGENE;
+ break;
+ case 'n':
+ if ( !strncmp(line,"nonsense_mediated_decay",23) ) return GF_NMD;
+ else if ( !strncmp(line,"non_stop_decay",14) ) return GF_NON_STOP_DECAY;
+ break;
+ case 'k':
+ if ( !strncmp(line,"known_ncrna",11) ) return GF_KNOWN_NCRNA;
+ break;
+ case 'u':
+ if ( !strncmp(line,"unitary_pseudogene",18) ) return GF_UNITARY_PSEUDOGENE;
+ else if ( !strncmp(line,"unprocessed_pseudogene",22) ) return GF_UNPROCESSED_PSEUDOGENE;
+ break;
+ case 'L':
+ if ( !strncmp(line,"LRG_gene",8) ) return GF_LRG_GENE;
+ break;
+ case '3':
+ if ( !strncmp(line,"3prime_overlapping_ncRNA",24) ) return GF_3PRIME_OVERLAPPING_ncRNA;
+ break;
+ case 'd':
+ if ( !strncmp(line,"disrupted_domain",16) ) return GF_DISRUPTED_DOMAIN;
+ break;
+ case 'v':
+ if ( !strncmp(line,"vaultRNA",8) ) return GF_vaultRNA;
+ break;
+ case 'b':
+ if ( !strncmp(line,"bidirectional_promoter_lncRNA",29) ) return GF_BIDIRECTIONAL_PROMOTER_lncRNA;
+ break;
+ }
+ return 0;
+}
+static inline int gff_ignored_biotype(args_t *args, char *ss)
+{
+ ss = strstr(ss,"biotype=");
+ if ( !ss ) return 0;
+
+ ss += 8;
+ char *se = ss, tmp;
+ while ( *se && *se!=';' ) se++;
+ tmp = *se;
+ *se = 0;
+
+ char *key = ss;
+ int n = 0;
+ if ( khash_str2int_get(args->init.ignored_biotypes, ss, &n)!=0 ) key = strdup(ss);
+ khash_str2int_set(args->init.ignored_biotypes, key, n+1);
+
+ *se = tmp;
+ return 1;
+}
+gf_gene_t *gene_init(aux_t *aux, uint32_t gene_id)
+{
+ khint_t k = kh_get(int2gene, aux->gid2gene, (int)gene_id);
+ gf_gene_t *gene = (k == kh_end(aux->gid2gene)) ? NULL : kh_val(aux->gid2gene, k);
+ if ( !gene )
+ {
+ gene = (gf_gene_t*) calloc(1,sizeof(gf_gene_t));
+ int ret;
+ k = kh_put(int2gene, aux->gid2gene, (int)gene_id, &ret);
+ kh_val(aux->gid2gene,k) = gene;
+ }
+ return gene;
+}
+void gff_parse_transcript(args_t *args, const char *line, char *ss, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored transcript: %s\n",line);
+ return;
+ }
+
+ // create a mapping from transcript_id to gene_id
+ uint32_t trid = gff_parse_id(line, "ID=transcript:", ss);
+ uint32_t gene_id = gff_parse_id(line, "Parent=gene:", ss);
+
+ if ( !ENSID_FMT ) gff_parse_ensid_fmt(line, "ID=transcript:", ss); // id prefix different across species
+
+ tscript_t *tr = (tscript_t*) calloc(1,sizeof(tscript_t));
+ tr->id = trid;
+ tr->strand = ftr->strand;
+ tr->gene = gene_init(aux, gene_id);
+ tr->type = biotype;
+ tr->beg = ftr->beg;
+ tr->end = ftr->end;
+
+ khint_t k;
+ int ret;
+ k = kh_put(int2tscript, aux->id2tr, (int)trid, &ret);
+ kh_val(aux->id2tr,k) = tr;
+}
+void gff_parse_gene(args_t *args, const char *line, char *ss, char *chr_beg, char *chr_end, ftr_t *ftr)
+{
+ int biotype = gff_parse_biotype(ss);
+ if ( biotype <= 0 )
+ {
+ if ( !gff_ignored_biotype(args, ss) && args->quiet<2 ) fprintf(pysam_stderr,"ignored gene: %s\n",line);
+ return;
+ }
+
+ aux_t *aux = &args->init;
+
+ // substring search for "ID=gene:ENSG00000437963"
+ uint32_t gene_id = gff_parse_id(line, "ID=gene:", ss);
+ gf_gene_t *gene = gene_init(aux, gene_id);
+ assert( !gene->name ); // the gene_id should be unique
+
+ gene->iseq = feature_set_seq(args, chr_beg,chr_end);
+
+ // substring search for "Name=OR4F5"
+ ss = strstr(chr_end+2,"Name=");
+ if ( !ss ) error("Could not parse the line, \"Name=\" not present: %s\n", line);
+ ss += 5;
+ char *se = ss;
+ while ( *se && *se!=';' && !isspace(*se) ) se++;
+ gene->name = (char*) malloc(se-ss+1);
+ memcpy(gene->name,ss,se-ss);
+ gene->name[se-ss] = 0;
+}
+int gff_parse(args_t *args, char *line, ftr_t *ftr)
+{
+ // - skip empty lines and commented lines
+ // - columns
+ // 1. chr
+ // 2. <skip>
+ // 3. CDS, transcript, gene, ...
+ // 4-5. beg,end
+ // 6. <skip>
+ // 7. strand
+ // 8. phase
+ // 9. Parent=transcript:ENST(\d+);ID=... etc
+
+ char *ss = line;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *chr_beg, *chr_end;
+ gff_parse_chr(line, &chr_beg, &chr_end);
+ ss = gff_skip(line, chr_end + 2);
+
+ // 3. column: is this a CDS, transcript, gene, etc.
+ if ( !strncmp("exon\t",ss,5) ) { ftr->type = GF_EXON; ss += 5; }
+ else if ( !strncmp("CDS\t",ss,4) ) { ftr->type = GF_CDS; ss += 4; }
+ else if ( !strncmp("three_prime_UTR\t",ss,16) ) { ftr->type = GF_UTR3; ss += 16; }
+ else if ( !strncmp("five_prime_UTR\t",ss,15) ) { ftr->type = GF_UTR5; ss += 15; }
+ else
+ {
+ ss = gff_skip(line, ss);
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+ int type = gff_parse_type(ss);
+ if ( type!=GFF_TSCRIPT_LINE && type!=GFF_GENE_LINE )
+ {
+ // we ignore these, debug print to see new types:
+ ss = strstr(ss,"ID=");
+ if ( !ss ) return -1; // no ID, ignore the line
+ if ( !strncmp("chromosome",ss+3,10) ) return -1;
+ if ( !strncmp("supercontig",ss+3,11) ) return -1;
+ if ( args->quiet<2 ) fprintf(pysam_stderr,"ignored: %s\n", line);
+ return -1;
+ }
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else error("Unknown strand: %c .. %s\n", *ss,ss);
+
+ if ( type==GFF_TSCRIPT_LINE )
+ gff_parse_transcript(args, line, ss, ftr);
+ else
+ gff_parse_gene(args, line, ss, chr_beg, chr_end, ftr);
+
+ return -1;
+ }
+ ss = gff_parse_beg_end(line, ss, &ftr->beg,&ftr->end);
+ ss = gff_skip(line, ss);
+
+ // 7. column: strand
+ if ( *ss == '+' ) ftr->strand = STRAND_FWD;
+ else if ( *ss == '-' ) ftr->strand = STRAND_REV;
+ else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown strand: %c\n", *ss); return -1; }
+ ss += 2;
+
+ // 8. column: phase (codon offset)
+ if ( *ss == '0' ) ftr->phase = 0;
+ else if ( *ss == '1' ) ftr->phase = 1;
+ else if ( *ss == '2' ) ftr->phase = 2;
+ else if ( *ss == '.' ) ftr->phase = 0; // exons do not have phase
+ else { if ( args->quiet<2 ) fprintf(pysam_stderr,"Skipping unknown phase: %c, %s\n", *ss, line); return -1; }
+ ss += 2;
+
+ // substring search for "Parent=transcript:ENST00000437963"
+ ftr->trid = gff_parse_id(line, "Parent=transcript:", ss);
+ ftr->iseq = feature_set_seq(args, chr_beg,chr_end);
+ return 0;
+}
+
+static int cmp_cds_ptr(const void *a, const void *b)
+{
+ // comparison function for qsort of transcripts's CDS
+ if ( (*((gf_cds_t**)a))->beg < (*((gf_cds_t**)b))->beg ) return -1;
+ if ( (*((gf_cds_t**)a))->beg > (*((gf_cds_t**)b))->beg ) return 1;
+ return 0;
+}
+
+static inline void chr_beg_end(aux_t *aux, int iseq, char **chr_beg, char **chr_end)
+{
+ *chr_beg = *chr_end = aux->seq[iseq];
+ while ( (*chr_end)[1] ) (*chr_end)++;
+}
+tscript_t *tscript_init(aux_t *aux, uint32_t trid)
+{
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)trid);
+ tscript_t *tr = (k == kh_end(aux->id2tr)) ? NULL : kh_val(aux->id2tr, k);
+ assert( tr );
+ return tr;
+}
+void register_cds(args_t *args, ftr_t *ftr)
+{
+ // Make the CDS searchable via idx_cds. Note we do not malloc tr->cds just yet.
+ // ftr is the result of parsing a gff CDS line
+ aux_t *aux = &args->init;
+
+ tscript_t *tr = tscript_init(aux, ftr->trid);
+ if ( tr->strand != ftr->strand ) error("Conflicting strand in transcript %"PRIu32" .. %d vs %d\n",ftr->trid,tr->strand,ftr->strand);
+
+ gf_cds_t *cds = (gf_cds_t*) malloc(sizeof(gf_cds_t));
+ cds->tr = tr;
+ cds->beg = ftr->beg;
+ cds->len = ftr->end - ftr->beg + 1;
+ cds->icds = 0; // to keep valgrind on mac happy
+ cds->phase = ftr->phase;
+
+ hts_expand(gf_cds_t*,tr->ncds+1,tr->mcds,tr->cds);
+ tr->cds[tr->ncds++] = cds;
+}
+void register_utr(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_utr_t *utr = (gf_utr_t*) malloc(sizeof(gf_utr_t));
+ utr->which = ftr->type==GF_UTR3 ? prime3 : prime5;
+ utr->beg = ftr->beg;
+ utr->end = ftr->end;
+ utr->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, utr->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_utr, chr_beg,chr_end, utr->beg,utr->end, &utr);
+}
+void register_exon(args_t *args, ftr_t *ftr)
+{
+ aux_t *aux = &args->init;
+ gf_exon_t *exon = (gf_exon_t*) malloc(sizeof(gf_exon_t));
+ exon->beg = ftr->beg;
+ exon->end = ftr->end;
+ exon->tr = tscript_init(aux, ftr->trid);
+
+ char *chr_beg, *chr_end;
+ chr_beg_end(&args->init, exon->tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_exon, chr_beg,chr_end, exon->beg - N_SPLICE_REGION_INTRON, exon->end + N_SPLICE_REGION_INTRON, &exon);
+}
+
+void tscript_init_cds(args_t *args)
+{
+ aux_t *aux = &args->init;
+
+ // Sort CDS in all transcripts, set offsets, check their phase, length, create index (idx_cds)
+ khint_t k;
+ for (k=0; k<kh_end(aux->id2tr); k++)
+ {
+ if ( !kh_exist(aux->id2tr, k) ) continue;
+ tscript_t *tr = (tscript_t*) kh_val(aux->id2tr, k);
+
+ // position-to-tscript lookup
+ char *chr_beg, *chr_end;
+ chr_beg_end(aux, tr->gene->iseq, &chr_beg, &chr_end);
+ regidx_push(args->idx_tscript, chr_beg, chr_end, tr->beg, tr->end, &tr);
+
+ if ( !tr->ncds ) continue; // transcript with no CDS
+
+ // sort CDs
+ qsort(tr->cds, tr->ncds, sizeof(gf_cds_t*), cmp_cds_ptr);
+
+ // trim non-coding start
+ int i, len = 0;
+ if ( tr->strand==STRAND_FWD )
+ {
+ if ( tr->cds[0]->phase ) tr->trim |= TRIM_5PRIME;
+ tr->cds[0]->beg += tr->cds[0]->phase;
+ tr->cds[0]->len -= tr->cds[0]->phase;
+ tr->cds[0]->phase = 0;
+
+ // sanity check phase
+ for (i=0; i<tr->ncds; i++)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ assert( phase == len%3 );
+ len += tr->cds[i]->len;
+ }
+ }
+ else
+ {
+ // Check that the phase is not bigger than CDS length. Curiously, this can really happen,
+ // see Mus_musculus.GRCm38.85.gff3.gz, transcript:ENSMUST00000163141
+ // todo: the same for the fwd strand
+ i = tr->ncds - 1;
+ int phase = tr->cds[i]->phase;
+ if ( phase ) tr->trim |= TRIM_5PRIME;
+ while ( i>=0 && phase > tr->cds[i]->len )
+ {
+ phase -= tr->cds[i]->len;
+ tr->cds[i]->phase = 0;
+ tr->cds[i]->len = 0;
+ i--;
+ }
+ tr->cds[i]->len -= tr->cds[i]->phase;
+ tr->cds[i]->phase = 0;
+
+ // sanity check phase
+ for (i=tr->ncds-1; i>=0; i--)
+ {
+ int phase = tr->cds[i]->phase ? 3 - tr->cds[i]->phase : 0;
+ if ( phase!=len%3)
+ error("GFF3 assumption failed for transcript %s, CDS=%d: phase!=len%%3 (phase=%d, len=%d)\n",ENSID(tr->id),tr->cds[i]->beg+1,phase,len);
+ len += tr->cds[i]->len;
+ }
+ }
+
+ // set len. At the same check that CDS within a transcript do not overlap
+ len = 0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->icds = i;
+ len += tr->cds[i]->len;
+ if ( !i ) continue;
+
+ gf_cds_t *a = tr->cds[i-1];
+ gf_cds_t *b = tr->cds[i];
+ if ( a->beg + a->len - 1 >= b->beg )
+ error("Error: CDS overlap in the transcript %"PRIu32": %"PRIu32"-%"PRIu32" and %"PRIu32"-%"PRIu32"\n",
+ kh_key(aux->id2tr, k), a->beg+1,a->beg+a->len, b->beg+1,b->beg+b->len);
+ }
+ if ( len%3 != 0 )
+ {
+ // There are 13k transcripts with incomplete 3' CDS. See for example ENST00000524289
+ // http://sep2015.archive.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?db=core;g=ENSG00000155868;r=5:157138846-157159019;t=ENST00000524289
+ // Also, the incomplete CDS can be too short (1 or 2bp), so it is not enough to trim the last one.
+
+ tr->trim |= TRIM_3PRIME;
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = tr->ncds - 1;
+ while ( i>=0 && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ len -= dlen;
+ i--;
+ }
+ }
+ else
+ {
+ i = 0;
+ while ( i<tr->ncds && len%3 )
+ {
+ int dlen = tr->cds[i]->len >= len%3 ? len%3 : tr->cds[i]->len;
+ tr->cds[i]->len -= dlen;
+ tr->cds[i]->beg += dlen;
+ len -= dlen;
+ i++;
+ }
+ }
+ }
+
+ // set CDS offsets and insert into regidx
+ len=0;
+ for (i=0; i<tr->ncds; i++)
+ {
+ tr->cds[i]->pos = len;
+ len += tr->cds[i]->len;
+ regidx_push(args->idx_cds, chr_beg,chr_end, tr->cds[i]->beg,tr->cds[i]->beg+tr->cds[i]->len-1, &tr->cds[i]);
+ }
+ }
+}
+
+void regidx_free_gf(void *payload) { free(*((gf_cds_t**)payload)); }
+void regidx_free_tscript(void *payload) { tscript_t *tr = *((tscript_t**)payload); free(tr->cds); free(tr); }
+
+void init_gff(args_t *args)
+{
+ aux_t *aux = &args->init;
+ aux->seq2int = khash_str2int_init(); // chrom's numeric id
+ aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
+ aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
+ args->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(tscript_t*), NULL);
+ aux->ignored_biotypes = khash_str2int_init();
+
+ // parse gff
+ kstring_t str = {0,0,0};
+ htsFile *fp = hts_open(args->gff_fname,"r");
+ if ( !fp ) error("Failed to read %s\n", args->gff_fname);
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ hts_expand(ftr_t, aux->nftr+1, aux->mftr, aux->ftr);
+ int ret = gff_parse(args, str.s, aux->ftr + aux->nftr);
+ if ( !ret ) aux->nftr++;
+ }
+ free(str.s);
+ if ( hts_close(fp)!=0 ) error("Close failed: %s\n", args->gff_fname);
+
+
+ // process gff information: connect CDS and exons to transcripts
+ args->idx_cds = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_cds_t*), NULL);
+ args->idx_utr = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_utr_t*), NULL);
+ args->idx_exon = regidx_init(NULL, NULL, regidx_free_gf, sizeof(gf_exon_t*), NULL);
+ args->itr = regitr_init(NULL);
+
+ int i;
+ for (i=0; i<aux->nftr; i++)
+ {
+ ftr_t *ftr = &aux->ftr[i];
+
+ // check whether to keep this feature: is there a mapping trid -> gene_id -> gene?
+ khint_t k = kh_get(int2tscript, aux->id2tr, (int)ftr->trid);
+ if ( k==kh_end(aux->id2tr) ) continue; // no such transcript
+
+ tscript_t *tr = kh_val(aux->id2tr,k);
+ if ( !tr->gene->name )
+ {
+ // not a supported biotype (e.g. gene:pseudogene, transcript:processed_transcript)
+ regidx_free_tscript(&tr);
+ kh_del(int2tscript, aux->id2tr,k);
+ continue;
+ }
+
+ // populate regidx by category:
+ // ftr->type .. GF_CDS, GF_EXON, GF_UTR3, GF_UTR5
+ // gene->type .. GF_PROTEIN_CODING, GF_MT_rRNA, GF_IG_C, ...
+ if ( ftr->type==GF_CDS ) register_cds(args, ftr);
+ else if ( ftr->type==GF_EXON ) register_exon(args, ftr);
+ else if ( ftr->type==GF_UTR5 ) register_utr(args, ftr);
+ else if ( ftr->type==GF_UTR3 ) register_utr(args, ftr);
+ else
+ error("something: %s\t%d\t%d\t%s\t%s\n", aux->seq[ftr->iseq],ftr->beg+1,ftr->end+1,ENSID(ftr->trid),gf_type2gff_string(ftr->type));
+ }
+ tscript_init_cds(args);
+
+ if ( !args->quiet )
+ {
+ fprintf(pysam_stderr,"Indexed %d transcripts, %d exons, %d CDSs, %d UTRs\n",
+ regidx_nregs(args->idx_tscript),
+ regidx_nregs(args->idx_exon),
+ regidx_nregs(args->idx_cds),
+ regidx_nregs(args->idx_utr));
+ }
+
+ free(aux->ftr);
+ khash_str2int_destroy_free(aux->seq2int);
+ // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
+ kh_destroy(int2tscript,aux->id2tr);
+ free(aux->seq);
+
+ if ( args->quiet<2 && khash_str2int_size(aux->ignored_biotypes) )
+ {
+ khash_t(str2int) *ign = (khash_t(str2int)*)aux->ignored_biotypes;
+ fprintf(pysam_stderr,"Ignored the following biotypes:\n");
+ for (i = kh_begin(ign); i < kh_end(ign); i++)
+ {
+ if ( !kh_exist(ign,i)) continue;
+ fprintf(pysam_stderr,"\t%dx\t.. %s\n", kh_value(ign,i), kh_key(ign,i));
+ }
+ }
+ khash_str2int_destroy_free(aux->ignored_biotypes);
+}
+
+void init_data(args_t *args)
+{
+ args->nfmt_bcsq = 1 + (args->ncsq_max - 1) / 32;
+
+ if ( !args->quiet ) fprintf(pysam_stderr,"Parsing %s ...\n", args->gff_fname);
+ init_gff(args);
+
+ args->rid = -1;
+
+ if ( args->filter_str )
+ args->filter = filter_init(args->hdr, args->filter_str);
+
+ args->fai = fai_load(args->fa_fname);
+ if ( !args->fai ) error("Failed to load the fai index: %s\n", args->fa_fname);
+
+ args->pos2vbuf = kh_init(pos2vbuf);
+ args->active_tr = khp_init(trhp);
+ args->hap = (hap_t*) calloc(1,sizeof(hap_t));
+
+ // init samples
+ if ( !bcf_hdr_nsamples(args->hdr) ) args->phase = PHASE_DROP_GT;
+ if ( args->sample_list && !strcmp("-",args->sample_list) )
+ {
+ // ignore all samples
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ // significant speedup for plain VCFs
+ bcf_hdr_set_samples(args->hdr,NULL,0);
+ }
+ args->phase = PHASE_DROP_GT;
+ }
+ else
+ args->smpl = smpl_ilist_init(args->hdr, args->sample_list, args->sample_is_file, SMPL_STRICT);
+ args->hdr_nsmpl = args->phase==PHASE_DROP_GT ? 0 : bcf_hdr_nsamples(args->hdr);
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ args->out = args->output_fname ? fopen(args->output_fname,"w") : pysam_stdout;
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname,strerror(errno));
+
+ fprintf(args->out,"# This file was produced by: bcftools +csq(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ fprintf(args->out,"# The command line was:\tbcftools +%s", args->argv[0]);
+ int i;
+ for (i=1; i<args->argc; i++)
+ fprintf(args->out," %s",args->argv[i]);
+ fprintf(args->out,"\n");
+ fprintf(args->out,"# LOG\t[2]Message\n");
+ fprintf(args->out,"# CSQ"); i = 1;
+ fprintf(args->out,"\t[%d]Sample", ++i);
+ fprintf(args->out,"\t[%d]Haplotype", ++i);
+ fprintf(args->out,"\t[%d]Chromosome", ++i);
+ fprintf(args->out,"\t[%d]Position", ++i);
+ fprintf(args->out,"\t[%d]Consequence", ++i);
+ fprintf(args->out,"\n");
+ }
+ else
+ {
+ args->out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
+ if ( args->out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
+ bcf_hdr_append_version(args->hdr,args->argc,args->argv,"bcftools/csq");
+ bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=.,Type=String,Description=\"%s consequence annotation from BCFtools/csq. Format: '[*]consequence|gene|transcript|biotype[|strand|amino_acid_change|dna_change]' or, for consequences of variants split across multiple sites, a pointer to the record storing the consequences '@position'. '*' prefix indicates a consequence downstream from a stop \">",args->bcsq_tag, args->local_csq ? "Local" : "Haplotype-aware");
+ if ( args->hdr_nsmpl )
+ bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
+ bcf_hdr_write(args->out_fh, args->hdr);
+ }
+ if ( !args->quiet ) fprintf(pysam_stderr,"Calling...\n");
+}
+
+void destroy_data(args_t *args)
+{
+ regidx_destroy(args->idx_cds);
+ regidx_destroy(args->idx_utr);
+ regidx_destroy(args->idx_exon);
+ regidx_destroy(args->idx_tscript);
+ regitr_destroy(args->itr);
+
+ khint_t k,i,j;
+ for (k=0; k<kh_end(args->init.gid2gene); k++)
+ {
+ if ( !kh_exist(args->init.gid2gene, k) ) continue;
+ gf_gene_t *gene = (gf_gene_t*) kh_val(args->init.gid2gene, k);
+ free(gene->name);
+ free(gene);
+ }
+ kh_destroy(int2gene,args->init.gid2gene);
+
+ if ( args->filter )
+ filter_destroy(args->filter);
+
+ khp_destroy(trhp,args->active_tr);
+ kh_destroy(pos2vbuf,args->pos2vbuf);
+ if ( args->smpl ) smpl_ilist_destroy(args->smpl);
+ int ret;
+ if ( args->out_fh )
+ ret = hts_close(args->out_fh);
+ else
+ ret = fclose(args->out);
+ if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"pysam_stdout");
+ for (i=0; i<args->vcf_rbuf.m; i++)
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ if ( !vbuf ) continue;
+ for (j=0; j<vbuf->m; j++)
+ {
+ if ( !vbuf->vrec[j] ) continue;
+ if ( vbuf->vrec[j]->line ) bcf_destroy(vbuf->vrec[j]->line);
+ free(vbuf->vrec[j]->smpl);
+ free(vbuf->vrec[j]->vcsq);
+ free(vbuf->vrec[j]);
+ }
+ free(vbuf->vrec);
+ free(vbuf);
+ }
+ free(args->vcf_buf);
+ free(args->rm_tr);
+ free(args->csq_buf);
+ free(args->hap->stack);
+ free(args->hap->sseq.s);
+ free(args->hap->tseq.s);
+ free(args->hap->tref.s);
+ free(args->hap);
+ fai_destroy(args->fai);
+ free(args->gt_arr);
+ free(args->str.s);
+ free(args->str2.s);
+ free(ENSID_FMT);
+}
+
+/*
+ The splice_* functions are for consquences around splice sites: start,stop,splice_*
+ */
+#define SPLICE_VAR_REF 0 // ref: ACGT>ACGT, csq not applicable, skip completely
+#define SPLICE_OUTSIDE 1 // splice acceptor or similar; csq set and is done, does not overlap the region
+#define SPLICE_INSIDE 2 // overlaps coding region; csq can be set but coding prediction is needed
+#define SPLICE_OVERLAP 3 // indel overlaps region boundary, csq set but could not determine csq
+typedef struct
+{
+ tscript_t *tr;
+ struct {
+ int32_t pos, rlen, alen;
+ char *ref, *alt;
+ bcf1_t *rec;
+ } vcf;
+ uint16_t check_acceptor:1, // check distance from exon start (fwd) or end (rev)
+ check_start:1, // this is the first coding exon (relative to transcript orientation), check first (fwd) or last (rev) codon
+ check_stop:1, // this is the last coding exon (relative to transcript orientation), check last (fwd) or first (rev) codon
+ check_donor:1, // as with check_acceptor
+ check_region_beg:1, // do/don't check for splices at this end, eg. in the first or last exon
+ check_region_end:1, //
+ check_utr:1, // check splice sites (acceptor/donor/region_*) only if not in utr
+ set_refalt:1; // set kref,kalt, if set, check also for synonymous events
+ uint32_t csq;
+ int tbeg, tend; // number of trimmed bases from beg and end of ref,alt allele
+ uint32_t ref_beg, // ref coordinates with spurious bases removed, ACC>AC can become AC>A or CC>C, whichever gives
+ ref_end; // a more conservative csq (the first and last base in kref.s)
+ kstring_t kref, kalt; // trimmed alleles, set only with SPLICE_OLAP
+}
+splice_t;
+void splice_init(splice_t *splice, bcf1_t *rec)
+{
+ memset(splice,0,sizeof(*splice));
+ splice->vcf.rec = rec;
+ splice->vcf.pos = rec->pos;
+ splice->vcf.rlen = rec->rlen;
+ splice->vcf.ref = rec->d.allele[0];
+}
+static inline void splice_build_hap(splice_t *splice, uint32_t beg, int len)
+{
+ // len>0 .. beg is the first base, del filled from right
+ // len<0 .. beg is the last base, del filled from left
+
+ int rlen, alen, rbeg, abeg; // first base to include (ref coordinates)
+ if ( len<0 )
+ {
+ rlen = alen = -len;
+ rbeg = beg - rlen + 1;
+ int dlen = splice->vcf.alen - splice->vcf.rlen;
+ if ( dlen<0 && beg < splice->ref_end ) // incomplete del, beg is in the middle
+ dlen += splice->ref_end - beg;
+ abeg = rbeg + dlen;
+ }
+ else
+ {
+ rbeg = abeg = beg;
+ rlen = alen = len;
+ // check for incomplete del as above??
+ }
+
+#define XDBG 0
+#if XDBG
+fprintf(pysam_stderr,"build_hap: rbeg=%d + %d abeg=%d \n",rbeg,rlen,abeg);
+#endif
+ splice->kref.l = 0;
+ splice->kalt.l = 0;
+
+ // add the part before vcf.ref, in the vcf.ref and after vcf.ref
+ int roff; // how many vcf.ref bases already used
+ if ( rbeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= rbeg ); // this can be extended thanks to N_REF_PAD
+ kputsn(splice->tr->ref + N_REF_PAD + rbeg - splice->tr->beg, splice->vcf.pos - rbeg, &splice->kref);
+ roff = 0;
+ }
+ else
+ roff = rbeg - splice->vcf.pos;
+#if XDBG
+fprintf(pysam_stderr,"r1: %s roff=%d\n",splice->kref.s,roff);
+#endif
+
+ if ( roff < splice->vcf.rlen && splice->kref.l < rlen )
+ {
+ int len = splice->vcf.rlen - roff; // len still available in vcf.ref
+ if ( len > rlen - splice->kref.l ) len = rlen - splice->kref.l; // how much of ref allele is still needed
+ kputsn(splice->vcf.ref + roff, len, &splice->kref);
+ }
+#if XDBG
+fprintf(pysam_stderr,"r2: %s\n",splice->kref.s);
+#endif
+
+ uint32_t end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kref.l < rlen )
+ {
+ if ( end + rlen - splice->kref.l - 1 > splice->tr->end ) // trim, the requested sequence is too long (could be extended, see N_REF_PAD)
+ rlen -= end + rlen - splice->kref.l - 1 - splice->tr->end;
+ if ( splice->kref.l < rlen )
+ kputsn(splice->tr->ref + N_REF_PAD + end - splice->tr->beg, rlen - splice->kref.l, &splice->kref);
+ }
+#if XDBG
+fprintf(pysam_stderr,"r3: %s\n",splice->kref.s);
+#endif
+
+
+ int aoff;
+ if ( abeg < splice->vcf.pos )
+ {
+ assert( splice->tr->beg <= abeg );
+ kputsn(splice->tr->ref + N_REF_PAD + abeg - splice->tr->beg, splice->vcf.pos - abeg, &splice->kalt);
+ aoff = 0;
+ }
+ else
+ aoff = abeg - splice->vcf.pos;
+#if XDBG
+fprintf(pysam_stderr,"a1: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ if ( aoff < splice->vcf.alen && splice->kalt.l < alen )
+ {
+ int len = splice->vcf.alen - aoff; // len still available in vcf.alt
+ if ( len > alen - splice->kalt.l ) len = alen - splice->kalt.l; // how much of alt allele is still needed
+ kputsn(splice->vcf.alt + aoff, len, &splice->kalt);
+ aoff -= len;
+ }
+ if ( aoff < 0 ) aoff = 0;
+ else aoff--;
+#if XDBG
+fprintf(pysam_stderr,"a2: %s aoff=%d\n",splice->kalt.s,aoff);
+#endif
+
+ end = splice->vcf.pos + splice->vcf.rlen; // position just after the ref allele
+ if ( splice->kalt.l < alen )
+ {
+ if ( end + alen + aoff - splice->kalt.l - 1 > splice->tr->end ) // trim, the requested sequence is too long
+ alen -= end + alen + aoff - splice->kalt.l - 1 - splice->tr->end;
+ if ( alen > 0 && alen > splice->kalt.l )
+ kputsn(splice->tr->ref + aoff + N_REF_PAD + end - splice->tr->beg, alen - splice->kalt.l, &splice->kalt);
+ }
+#if XDBG
+fprintf(pysam_stderr,"a3: %s\n",splice->kalt.s);
+fprintf(pysam_stderr," [%s]\n [%s]\n\n",splice->kref.s,splice->kalt.s);
+#endif
+}
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec);
+static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32_t trid)
+{
+ while ( regitr_overlap(itr) )
+ {
+ gf_utr_t *utr = regitr_payload(itr, gf_utr_t*);
+ tscript_t *tr = utr->tr;
+ if ( tr->id != trid ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ return csq.type.type;
+ }
+ return 0;
+}
+static inline void csq_stage_splice(args_t *args, bcf1_t *rec, tscript_t *tr, uint32_t type)
+{
+#if XDBG
+fprintf(pysam_stderr,"csq_stage_splice %d: type=%d\n",rec->pos+1,type);
+#endif
+ if ( !type ) return;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = type;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+}
+static inline int splice_csq_ins(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG, 1bp
+ // before and after the inserted bases
+ if ( splice->tbeg || splice->vcf.ref[0]!=splice->vcf.alt[0] )
+ {
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+ else
+ {
+ if ( splice->tend ) splice->tend--;
+ splice->ref_beg = splice->vcf.pos;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend;
+ }
+#if XDBG
+fprintf(pysam_stderr,"ins: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ int ret;
+ if ( splice->ref_beg >= ex_end ) // fully outside, beyond the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg+1,splice->ref_beg+1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_end ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ if ( splice->ref_end < ex_beg || (splice->ref_end == ex_beg && !splice->check_region_beg) ) // fully outside, before the exon
+ {
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_end-1,splice->ref_end-1, itr) ) // adjacent utr
+ {
+ ret = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ if ( ret!=0 )
+ {
+ regitr_destroy(itr);
+ return SPLICE_OUTSIDE; // overlaps utr
+ }
+ }
+ regitr_destroy(itr);
+ }
+ if ( !splice->check_region_beg ) return SPLICE_OUTSIDE;
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end > ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+ // overlaps the exon or inside the exon
+ // possible todo: find better alignment for frameshifting variants?
+ if ( splice->ref_beg <= ex_beg + 2 ) // in the first 3bp
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 2 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ // Make sure the variant will not end up left aligned to avoid overlapping vcf records
+ // splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ // splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ // if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ if ( splice->ref_beg < splice->vcf.pos ) // this must have been caused by too much trimming from right
+ {
+ int dlen = splice->vcf.pos - splice->ref_beg;
+ assert( dlen==1 );
+ splice->tbeg += dlen;
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) splice->tend -= dlen;
+ splice->ref_beg = splice->vcf.pos;
+ }
+ if ( splice->ref_end==ex_beg ) splice->tend--; // prevent zero-length ref allele
+ splice_build_hap(splice, splice->ref_beg, splice->vcf.alen - splice->tend - splice->tbeg + 1);
+ splice->vcf.rlen -= splice->tbeg + splice->tend - 1;
+ if ( splice->kref.l > splice->vcf.rlen ) { splice->kref.l = splice->vcf.rlen; splice->kref.s[splice->kref.l] = 0; }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_del(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // coordinates that matter for consequences, eg AC>ACG trimmed to C>CG
+ splice->ref_beg = splice->vcf.pos + splice->tbeg - 1; // 1b before the deleted base
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1; // the last deleted base
+
+#if XDBG
+fprintf(pysam_stderr,"del: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg + 1 < ex_beg ) // the part before the exon; ref_beg is off by -1
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ // filling from the left does not work for ENST00000341065/frame3.vcf
+ // CAG.GTGGCCAG CAG.GTGGCCAG
+ // CA-.--GGCCAG vs CAG.---GCCAG
+ // splice_build_hap(splice, ex_beg-1, -N_SPLICE_REGION_INTRON);
+ //
+ // filling from the right:
+ splice_build_hap(splice, ex_beg - N_SPLICE_REGION_INTRON, N_SPLICE_REGION_INTRON);
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos + 1;
+ splice->ref_beg = ex_beg - 1;
+ if ( splice->tbeg + splice->tend == splice->vcf.alen )
+ {
+ // the deletion overlaps ex_beg and cannot be easily realigned to the right
+ if ( !splice->tend )
+ {
+ splice->csq |= CSQ_CODING_SEQUENCE;
+ return SPLICE_OVERLAP;
+ }
+ splice->tend--;
+ }
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ char *ref = NULL, *alt = NULL;
+ if ( splice->set_refalt ) // seq identity is checked only when tr->ref is available
+ {
+ splice_build_hap(splice, ex_end+1, N_SPLICE_REGION_INTRON); // ref,alt positioned at the first intron base
+ ref = splice->kref.s, alt = splice->kalt.s;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ {
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( ref && alt && !strncmp(ref,alt,N_SPLICE_REGION_INTRON) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ if ( splice->ref_beg < ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ if ( ref && alt && !strncmp(ref+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,alt+N_SPLICE_REGION_INTRON-N_SPLICE_DONOR,N_SPLICE_DONOR) ) splice->csq |= CSQ_SYNONYMOUS_VARIANT;
+ }
+ }
+ }
+ if ( splice->ref_beg < ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg >= ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 2 ) // ref_beg is off by -1
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ if ( splice->tbeg>0 ) splice->tbeg--; //why is this?
+ if ( splice->vcf.rlen > splice->tbeg + splice->tend && splice->vcf.alen > splice->tbeg + splice->tend )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->vcf.alen -= splice->tbeg + splice->tend;
+ }
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
+ if ( (splice->ref_beg+1 < ex_beg && splice->ref_end >= ex_beg) || (splice->ref_beg+1 < ex_end && splice->ref_end >= ex_end) ) // ouch, ugly ENST00000409523/long-overlapping-del.vcf
+ {
+ splice->csq |= (splice->ref_end - splice->ref_beg + 1)%3 ? CSQ_FRAMESHIFT_VARIANT : CSQ_INFRAME_DELETION;
+ return SPLICE_OVERLAP;
+ }
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+
+static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ // not a real variant, can be ignored: eg ACGT>ACGT
+ if ( splice->tbeg + splice->tend == splice->vcf.rlen ) return SPLICE_VAR_REF;
+
+ splice->ref_beg = splice->vcf.pos + splice->tbeg;
+ splice->ref_end = splice->vcf.pos + splice->vcf.rlen - splice->tend - 1;
+
+#if XDBG
+fprintf(pysam_stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
+#endif
+
+ if ( splice->ref_beg < ex_beg ) // the part before the exon
+ {
+ if ( splice->check_region_beg )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,splice->ref_beg,ex_beg-1, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_end >= ex_beg - N_SPLICE_REGION_INTRON && splice->ref_beg < ex_beg - N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_end >= ex_beg - N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_end >= ex_beg )
+ {
+ splice->tbeg = splice->ref_beg - splice->vcf.pos;
+ splice->ref_beg = ex_beg;
+ }
+ }
+ if ( ex_end < splice->ref_end ) // the part after the exon
+ {
+ if ( splice->check_region_end )
+ {
+ int csq = 0;
+ if ( splice->check_utr )
+ {
+ regitr_t *itr = regitr_init(NULL);
+ const char *chr = bcf_seqname(args->hdr,splice->vcf.rec);
+ if ( regidx_overlap(args->idx_utr,chr,ex_end+1,splice->ref_end, itr) ) // adjacent utr
+ csq = csq_stage_utr(args, itr, splice->vcf.rec, splice->tr->id);
+ regitr_destroy(itr);
+ }
+ if ( !csq )
+ {
+ if ( splice->ref_beg <= ex_end + N_SPLICE_REGION_INTRON && splice->ref_end > ex_end + N_SPLICE_DONOR )
+ splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->ref_beg <= ex_end + N_SPLICE_DONOR )
+ {
+ if ( splice->check_donor && splice->tr->strand==STRAND_FWD ) splice->csq |= CSQ_SPLICE_DONOR;
+ if ( splice->check_acceptor && splice->tr->strand==STRAND_REV ) splice->csq |= CSQ_SPLICE_ACCEPTOR;
+ }
+ }
+ }
+ if ( splice->ref_beg <= ex_end )
+ {
+ splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
+ splice->ref_end = ex_end;
+ }
+ }
+ if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
+ {
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_OUTSIDE;
+ }
+
+ if ( splice->ref_beg < ex_beg + 3 )
+ {
+ if ( splice->check_region_beg ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_FWD ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->ref_end > ex_end - 3 )
+ {
+ if ( splice->check_region_end ) splice->csq |= CSQ_SPLICE_REGION;
+ if ( splice->tr->strand==STRAND_REV ) { if ( splice->check_start ) splice->csq |= CSQ_START_LOST; }
+ else { if ( splice->check_stop ) splice->csq |= CSQ_STOP_LOST; }
+ }
+ if ( splice->set_refalt )
+ {
+ splice->vcf.rlen -= splice->tbeg + splice->tend;
+ splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
+ splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
+ }
+ csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq);
+ return SPLICE_INSIDE;
+}
+static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
+{
+ splice->csq = 0;
+ splice->vcf.alen = strlen(splice->vcf.alt);
+
+ int rlen1 = splice->vcf.rlen - 1, alen1 = splice->vcf.alen - 1, i = 0;
+ splice->tbeg = 0, splice->tend = 0;
+
+ // trim from the right, then from the left
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[rlen1-i] != splice->vcf.alt[alen1-i] ) break;
+ i++;
+ }
+ splice->tend = i;
+ rlen1 -= i, alen1 -= i, i = 0;
+ while ( i<=rlen1 && i<=alen1 )
+ {
+ if ( splice->vcf.ref[i] != splice->vcf.alt[i] ) break;
+ i++;
+ }
+ splice->tbeg = i;
+
+ // The mnp, ins and del code was split into near-identical functions for clarity and debugging;
+ // possible todo: generalize once stable
+ if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end);
+ if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end);
+
+ return 0;
+}
+
+// return value: 0 added, 1 overlapping variant, 2 silent discard (intronic,alt=ref)
+int hap_init(args_t *args, hap_node_t *parent, hap_node_t *child, gf_cds_t *cds, bcf1_t *rec, int ial)
+{
+ int i;
+ kstring_t str = {0,0,0};
+ tscript_t *tr = cds->tr;
+ child->icds = cds->icds; // index of cds in the tscript's list of exons
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.tr = tr;
+ splice.vcf.alt = rec->d.allele[ial];
+ splice.check_acceptor = splice.check_donor = splice.set_refalt = splice.check_utr = 1;
+ if ( !(tr->trim & TRIM_5PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==0 ) splice.check_start = 1; }
+ else { if ( child->icds==tr->ncds-1 ) splice.check_start = 1; }
+ }
+ if ( !(tr->trim & TRIM_3PRIME) )
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( child->icds==tr->ncds-1 ) splice.check_stop = 1; }
+ else { if ( child->icds==0 ) splice.check_stop = 1; }
+ }
+ if ( splice.check_start ) // do not check starts in incomplete CDS, defined as not starting with M
+ {
+ if ( tr->strand==STRAND_FWD ) { if ( dna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg) != 'M' ) splice.check_start = 0; }
+ else { if ( cdna2aa(tr->ref+N_REF_PAD+cds->beg-tr->beg+cds->len-3) != 'M' ) splice.check_start = 0; }
+ }
+ if ( child->icds!=0 ) splice.check_region_beg = 1;
+ if ( child->icds!=tr->ncds-1 ) splice.check_region_end = 1;
+
+#if XDBG
+fprintf(pysam_stderr,"\n%d [%s][%s] check start:%d,stop:%d\n",splice.vcf.pos+1,splice.vcf.ref,splice.vcf.alt,splice.check_start,splice.check_stop);
+#endif
+ int ret = splice_csq(args, &splice, cds->beg, cds->beg + cds->len - 1);
+#if XDBG
+fprintf(pysam_stderr,"cds splice_csq: %d [%s][%s] .. beg,end=%d %d, ret=%d, csq=%d\n\n",splice.vcf.pos+1,splice.kref.s,splice.kalt.s,splice.ref_beg+1,splice.ref_end+1,ret,splice.csq);
+#endif
+
+ if ( ret==SPLICE_VAR_REF ) return 2; // not a variant, eg REF=CA ALT=CA
+ if ( ret==SPLICE_OUTSIDE || ret==SPLICE_OVERLAP ) // not a coding csq
+ {
+ free(splice.kref.s);
+ free(splice.kalt.s);
+
+ if ( !splice.csq ) return 2; // fully intronic, no csq
+
+ // splice_region/acceptor/donor
+ child->seq = NULL;
+ child->sbeg = 0;
+ child->rbeg = rec->pos;
+ child->rlen = 0;
+ child->dlen = 0;
+ kputs(rec->d.allele[0],&str);
+ kputc('>',&str);
+ kputs(rec->d.allele[ial],&str);
+ child->var = str.s;
+ child->type = HAP_SSS;
+ child->csq = splice.csq;
+ child->prev = parent->type==HAP_SSS ? parent->prev : parent;
+ child->rec = rec;
+ return 0;
+ }
+ if ( splice.csq & CSQ_SYNONYMOUS_VARIANT ) splice.csq &= ~CSQ_SYNONYMOUS_VARIANT; // synonymous&splice,frame could become synonymous&frame,splice
+
+ int dbeg = 0;
+ if ( splice.ref_beg < cds->beg )
+ {
+ // The vcf record overlaps the exon boundary, but the variant itself
+ // should fit inside since we are here. This will need more work.
+ // #1475227917
+ dbeg = cds->beg - splice.ref_beg;
+ splice.kref.l -= dbeg;
+ splice.ref_beg = cds->beg;
+ assert( dbeg <= splice.kalt.l );
+ }
+
+ if ( parent->type==HAP_SSS ) parent = parent->prev;
+ if ( parent->type==HAP_CDS )
+ {
+ i = parent->icds;
+ if ( i!=cds->icds )
+ {
+ // the variant is on a new exon, finish up the previous
+ int len = tr->cds[i]->len - parent->rbeg - parent->rlen + tr->cds[i]->beg;
+ if ( len > 0 )
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+
+ // append any skipped non-variant exons
+ while ( ++i < cds->icds )
+ kputsn_(tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len, &str);
+
+ if ( parent->icds==child->icds )
+ {
+ int len = splice.ref_beg - parent->rbeg - parent->rlen;
+ if ( len < 0 ) // overlapping variants
+ {
+ free(str.s);
+ return 1;
+ }
+ kputsn_(tr->ref + N_REF_PAD + parent->rbeg + parent->rlen - tr->beg, len, &str);
+ }
+ else
+ kputsn_(tr->ref + N_REF_PAD + cds->beg - tr->beg, splice.ref_beg - cds->beg, &str);
+ }
+ kputs(splice.kalt.s + dbeg, &str);
+
+ child->seq = str.s;
+ child->sbeg = cds->pos + (splice.ref_beg - cds->beg);
+ child->rbeg = splice.ref_beg;
+ child->rlen = splice.kref.l;
+ child->type = HAP_CDS;
+ child->prev = parent;
+ child->rec = rec;
+ child->csq = splice.csq;
+
+ // set vlen and the "ref>alt" string
+ {
+ int rlen = strlen(rec->d.allele[0]);
+ int alen = strlen(rec->d.allele[ial]);
+ child->dlen = alen - rlen;
+ child->var = (char*) malloc(rlen+alen+2);
+ memcpy(child->var,rec->d.allele[0],rlen);
+ child->var[rlen] = '>';
+ memcpy(child->var+rlen+1,rec->d.allele[ial],alen);
+ child->var[rlen+alen+1] = 0;
+ }
+
+ // yuck, the whole CDS is modified/deleted, not ready for this, todo.
+ if ( child->rbeg + child->rlen > cds->beg + cds->len )
+ {
+ child->type = HAP_SSS;
+ if ( !child->csq ) child->csq |= CSQ_CODING_SEQUENCE; // hack, specifically for ENST00000390520/deletion-overlap.vcf
+ }
+
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return 0;
+}
+void hap_destroy(hap_node_t *hap)
+{
+ int i;
+ for (i=0; i<hap->nchild; i++)
+ if ( hap->child[i] ) hap_destroy(hap->child[i]);
+ for (i=0; i<hap->mcsq_list; i++) free(hap->csq_list[i].type.vstr.s);
+ free(hap->csq_list);
+ free(hap->child);
+ free(hap->cur_child);
+ free(hap->seq);
+ free(hap->var);
+ free(hap);
+}
+
+
+/*
+ ref: spliced reference and its length (ref.l)
+ seq: part of the spliced query transcript on the reference strand to translate, its
+ length (seq.l) and the total length of the complete transcript (seq.m)
+ sbeg: seq offset within the spliced query transcript
+ rbeg: seq offset within ref, 0-based
+ rend: last base of seq within ref, plus one. If seq does not contain indels, it is rend=rbeg+seq->l
+ strand: coding strand - 0:rev, 1:fwd
+ tseq: translated sequence (aa)
+ fill: frameshift, fill until the end (strand=fwd) or from the start (strand=rev)
+ */
+void cds_translate(kstring_t *_ref, kstring_t *_seq, uint32_t sbeg, uint32_t rbeg, uint32_t rend, int strand, kstring_t *tseq, int fill)
+{
+#if XDBG
+fprintf(pysam_stderr,"translate: %d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+#endif
+ char tmp[3], *codon, *end;
+ int i, len, npad;
+
+ kstring_t ref = *_ref;
+ kstring_t seq = *_seq;
+
+ tseq->l = 0;
+ if ( !seq.l )
+ {
+ kputc('?', tseq);
+ return;
+ }
+
+#define DBG 0
+#if DBG
+ fprintf(pysam_stderr,"translate: sbeg,rbeg,rend=%d %d %d fill=%d seq.l=%d\n",sbeg,rbeg,rend,fill,(int)_seq->l);
+ fprintf(pysam_stderr," ref: l=%d %s\n", (int)ref.l,ref.s);
+ fprintf(pysam_stderr," seq: l=%d m=%d ", (int)seq.l,(int)seq.m);
+ for (i=0; i<seq.l; i++) fprintf(pysam_stderr,"%c",seq.s[i]); fprintf(pysam_stderr,"\n");
+ fprintf(pysam_stderr," sbeg,rbeg,rend: %d,%d,%d\n", sbeg,rbeg,rend);
+ fprintf(pysam_stderr," strand,fill: %d,%d\n", strand,fill);
+#endif
+
+ if ( strand==STRAND_FWD )
+ {
+ // left padding
+ npad = sbeg % 3;
+#if DBG>1
+ fprintf(pysam_stderr," npad: %d\n",npad);
+#endif
+ assert( npad<=rbeg );
+
+ for (i=0; i<npad; i++)
+ tmp[i] = ref.s[rbeg+i-npad+N_REF_PAD];
+ for (; i<3 && i-npad<seq.l; i++)
+ tmp[i] = seq.s[i-npad];
+ len = seq.l - i + npad; // the remaining length of padded sseq
+#if DBG>1
+ fprintf(pysam_stderr,"\t i=%d\n", i);
+#endif
+ if ( i==3 )
+ {
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[1]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ codon = seq.s + 3 - npad; // next codon
+ end = codon + len - 1 - (len % 3); // last position of a valid codon
+ while ( codon < end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[2]%c%c%c\n",codon[0],codon[1],codon[2]);
+#endif
+ codon += 3;
+ }
+ end = seq.s + seq.l - 1;
+ for (i=0; codon+i<=end; i++) tmp[i] = codon[i];
+ }
+
+ // right padding
+ codon = ref.s + rend + N_REF_PAD;
+ if ( i>0 )
+ {
+#if DBG>1
+ if(i==1)fprintf(pysam_stderr,"[3]%c\n",tmp[0]);
+ if(i==2)fprintf(pysam_stderr,"[3]%c%c\n",tmp[0],tmp[1]);
+#endif
+ for (; i<3; i++)
+ {
+ tmp[i] = *codon;
+ codon++;
+ }
+ kputc_(dna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[4]%c%c%c\n",tmp[0],tmp[1],tmp[2]);
+#endif
+ }
+ if ( fill!=0 )
+ {
+ end = ref.s + ref.l - N_REF_PAD;
+ while ( codon+3 <= end )
+ {
+ kputc_(dna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],dna2aa(codon));
+#endif
+ codon += 3;
+ }
+ }
+ }
+ else // STRAND_REV
+ {
+ // right padding - number of bases to take from ref
+ npad = (seq.m - (sbeg + seq.l)) % 3;
+#if DBG>1
+ fprintf(pysam_stderr," npad: %d\n",npad);
+#endif
+if ( !(npad>=0 && sbeg+seq.l+npad<=seq.m) ) fprintf(pysam_stderr,"sbeg=%d seq.l=%d seq.m=%d\n",sbeg,(int)seq.l,(int)seq.m);
+ assert( npad>=0 && sbeg+seq.l+npad<=seq.m ); // todo: first codon on the rev strand
+
+ if ( npad==2 )
+ {
+ tmp[1] = ref.s[rend+N_REF_PAD];
+ tmp[2] = ref.s[rend+N_REF_PAD+1];
+ i = 0;
+ }
+ else if ( npad==1 )
+ {
+ tmp[2] = ref.s[rend+N_REF_PAD];
+ i = 1;
+ }
+ else
+ i = 2;
+
+ end = seq.s + seq.l;
+ for (; i>=0 && end>seq.s; i--) tmp[i] = *(--end);
+#if DBG>1
+ fprintf(pysam_stderr,"\t i=%d\n", i);
+ if(i==1)fprintf(pysam_stderr,"[0] %c\n",tmp[2]);
+ if(i==0)fprintf(pysam_stderr,"[0] %c%c\n",tmp[1],tmp[2]);
+#endif
+ if ( i==-1 )
+ {
+#if DBG>1
+ fprintf(pysam_stderr,"[1]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2], cdna2aa(tmp));
+#endif
+ kputc_(cdna2aa(tmp), tseq);
+ codon = end - 3;
+ while ( codon >= seq.s )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[2]%c%c%c\t%c\n",codon[0],codon[1],codon[2], cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ if ( seq.s-codon==2 )
+ {
+ tmp[2] = seq.s[0];
+ i = 1;
+ }
+ else if ( seq.s-codon==1 )
+ {
+ tmp[1] = seq.s[0];
+ tmp[2] = seq.s[1];
+ i = 0;
+ }
+ else
+ i = -1;
+#if DBG>1
+ if(i==1)fprintf(pysam_stderr,"[3] %c\n",tmp[2]);
+ if(i==0)fprintf(pysam_stderr,"[3] %c%c\n",tmp[1],tmp[2]);
+#endif
+ }
+ // left padding
+ end = ref.s + N_REF_PAD + rbeg;
+ if ( i>=0 )
+ {
+ for (; i>=0 && end>=ref.s; i--) tmp[i] = *(--end);
+ kputc_(cdna2aa(tmp), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[4]%c%c%c\t%c\n",tmp[0],tmp[1],tmp[2],cdna2aa(tmp));
+#endif
+ }
+ if ( fill!=0 )
+ {
+ codon = end - 3;
+ while ( codon >= ref.s + N_REF_PAD )
+ {
+ kputc_(cdna2aa(codon), tseq);
+#if DBG>1
+ fprintf(pysam_stderr,"[5]%c%c%c\t%c\n",codon[0],codon[1],codon[2],cdna2aa(codon));
+#endif
+ codon -= 3;
+ }
+ }
+ }
+ kputc_(0,tseq); tseq->l--;
+#if DBG
+ fprintf(pysam_stderr," tseq: %s\n", tseq->s);
+#endif
+}
+
+void tscript_splice_ref(tscript_t *tr)
+{
+ int i, len = 0;
+ for (i=0; i<tr->ncds; i++)
+ len += tr->cds[i]->len;
+
+ tr->nsref = len + 2*N_REF_PAD;
+ tr->sref = (char*) malloc(len + 1 + 2*N_REF_PAD);
+ len = 0;
+
+ memcpy(tr->sref, tr->ref + tr->cds[0]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ for (i=0; i<tr->ncds; i++)
+ {
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[i]->beg - tr->beg, tr->cds[i]->len);
+ len += tr->cds[i]->len;
+ }
+ memcpy(tr->sref + len, tr->ref + N_REF_PAD + tr->cds[tr->ncds-1]->beg - tr->beg, N_REF_PAD);
+ len += N_REF_PAD;
+
+ tr->sref[len] = 0;
+}
+
+// returns: 0 if consequence was added, 1 if it already exists or could not be added
+int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+#if XDBG
+fprintf(pysam_stderr,"csq_push: %d .. %d\n",rec->pos+1,csq->type.type);
+#endif
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
+ vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
+ if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+
+ int i;
+ for (i=0; i<vbuf->n; i++)
+ if ( vbuf->vrec[i]->line==rec ) break;
+ if ( i==vbuf->n ) error("This should not happen.. %s:%d %s\n", bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr);
+ vrec_t *vrec = vbuf->vrec[i];
+
+ // if the variant overlaps donor/acceptor and also splice region, report only donor/acceptor
+ if ( csq->type.type & CSQ_SPLICE_REGION && csq->type.type & (CSQ_SPLICE_DONOR|CSQ_SPLICE_ACCEPTOR) )
+ csq->type.type &= ~CSQ_SPLICE_REGION;
+
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ // Same as below, to avoid records like
+ // 3630 .. @3632,stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // 3632 .. stop_lost|AL627309.1|ENST00000423372|protein_coding|-|260*>260G|3630T>A+3632A>C
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i] = csq->type;
+ goto exit_duplicate;
+ }
+ if ( !(vrec->vcsq[i].type & CSQ_PRINTED_UPSTREAM) ) continue;
+ if ( csq->type.ref != vrec->vcsq[i].ref ) continue;
+ goto exit_duplicate;
+ }
+ }
+ else if ( csq->type.type & CSQ_COMPOUND )
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT ) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( csq->type.gene != vrec->vcsq[i].gene ) continue;
+ if ( csq->type.vstr.s || vrec->vcsq[i].vstr.s )
+ {
+ // This is a bit hacky, but we want a simpler and more predictable output. The splice_csq() function
+ // can trigger stop/start events based on indel overlap, then another stop/start event can be triggered
+ // from add_csq() or test_cds_local() based on sequence comparison, and on output we could find two
+ // consequences:
+ // stop_lost|AL627309.1|ENST00000423372|protein_coding|-
+ // stop_lost&inframe_insertion|AL627309.1|ENST00000423372|protein_coding|-|260*>260CL|3630T>TAAA
+ if ( !csq->type.vstr.s || !vrec->vcsq[i].vstr.s )
+ {
+ if ( csq->type.type&CSQ_START_STOP && vrec->vcsq[i].type&CSQ_START_STOP )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+
+ // remove stop_lost&synonymous if stop_retained set
+ if ( vrec->vcsq[i].type&CSQ_STOP_RETAINED )
+ vrec->vcsq[i].type &= ~(CSQ_STOP_LOST|CSQ_SYNONYMOUS_VARIANT);
+
+ if ( !vrec->vcsq[i].vstr.s ) vrec->vcsq[i].vstr = csq->type.vstr;
+ goto exit_duplicate;
+ }
+ continue;
+ }
+ if ( strcmp(csq->type.vstr.s,vrec->vcsq[i].vstr.s) ) continue;
+ }
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ }
+ else
+ {
+ for (i=0; i<vrec->nvcsq; i++)
+ {
+ if ( csq->type.trid != vrec->vcsq[i].trid && (csq->type.type|vrec->vcsq[i].type)&CSQ_PRN_TSCRIPT) continue;
+ if ( csq->type.biotype != vrec->vcsq[i].biotype ) continue;
+ if ( !(vrec->vcsq[i].type & CSQ_COMPOUND) )
+ {
+ vrec->vcsq[i].type |= csq->type.type;
+ goto exit_duplicate;
+ }
+ if ( vrec->vcsq[i].type==(vrec->vcsq[i].type|csq->type.type) ) goto exit_duplicate;
+ }
+ }
+ // no such csq yet in this vcf record
+ csq->vrec = vrec;
+ csq->idx = i;
+ vrec->nvcsq++;
+ hts_expand0(vcsq_t, vrec->nvcsq, vrec->mvcsq, vrec->vcsq);
+ vrec->vcsq[i] = csq->type;
+ return 0;
+
+exit_duplicate:
+ csq->vrec = vrec;
+ csq->idx = i;
+ return 1;
+}
+
+// soff .. position of the variant within the trimmed query transcript
+// sbeg .. position of the variant within the query transcript
+// rbeg .. position on the reference transcript (if there are no indels, then rbeg=send)
+// rpos .. VCF position
+#define node2soff(i) (hap->stack[i].slen - (hap->stack[i].node->rlen + hap->stack[i].node->dlen))
+#define node2sbeg(i) (hap->sbeg + node2soff(i))
+#define node2send(i) (hap->sbeg + hap->stack[i].slen)
+#define node2rbeg(i) (hap->stack[i].node->sbeg)
+#define node2rend(i) (hap->stack[i].node->sbeg + hap->stack[i].node->rlen)
+#define node2rpos(i) (hap->stack[i].node->rec->pos)
+
+void kput_vcsq(vcsq_t *csq, kstring_t *str)
+{
+ // Remove start/stop from incomplete CDS, but only if there is another
+ // consequence as something must be reported
+ if ( csq->type & CSQ_INCOMPLETE_CDS && (csq->type & ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS|CSQ_UPSTREAM_STOP)) ) csq->type &= ~(CSQ_START_STOP|CSQ_INCOMPLETE_CDS);
+
+ // Remove missense from start/stops
+ if ( csq->type & CSQ_START_STOP && csq->type & CSQ_MISSENSE_VARIANT ) csq->type &= ~CSQ_MISSENSE_VARIANT;
+
+ if ( csq->type & CSQ_PRINTED_UPSTREAM && csq->ref )
+ {
+ kputc_('@',str);
+ kputw(csq->ref->pos+1, str);
+ return;
+ }
+ if ( csq->type & CSQ_UPSTREAM_STOP )
+ kputc_('*',str);
+
+ int i, n = sizeof(csq_strings)/sizeof(char*);
+ for (i=1; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputs(csq_strings[i],str); break; }
+ i++;
+ for (; i<n; i++)
+ if ( csq_strings[i] && csq->type&(1<<i) ) { kputc_('&',str); kputs(csq_strings[i],str); }
+
+ kputc_('|', str);
+ if ( csq->gene ) kputs(csq->gene , str);
+
+ kputc_('|', str);
+ if ( csq->type & CSQ_PRN_TSCRIPT ) ksprintf(str, "%s",ENSID(csq->trid));
+
+ kputc_('|', str);
+ kputs(gf_type2gff_string(csq->biotype), str);
+
+ if ( CSQ_PRN_STRAND(csq->type) || csq->vstr.l )
+ kputs(csq->strand==STRAND_FWD ? "|+" : "|-", str);
+
+ if ( csq->vstr.l )
+ kputs(csq->vstr.s, str);
+}
+
+void hap_add_csq(args_t *args, hap_t *hap, hap_node_t *node, int tlen, int ibeg, int iend, int dlen, int indel)
+{
+ int i;
+ tscript_t *tr = hap->tr;
+ int ref_node = tr->strand==STRAND_FWD ? ibeg : iend;
+
+ int icsq = node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *csq = &node->csq_list[icsq];
+ csq->pos = hap->stack[ref_node].node->rec->pos;
+ csq->type.trid = tr->id;
+ csq->type.gene = tr->gene->name;
+ csq->type.strand = tr->strand;
+ csq->type.biotype = tr->type;
+
+ // only now we see the translated sequence and can determine if the stop/start changes are real
+ int rm_csq = 0;
+ csq->type.type = 0;
+ for (i=ibeg; i<=iend; i++)
+ csq->type.type |= hap->stack[i].node->csq & CSQ_COMPOUND;
+ if ( dlen==0 && indel ) csq->type.type |= CSQ_INFRAME_ALTERING;
+
+ int has_upstream_stop = hap->upstream_stop;
+ if ( hap->stack[ibeg].node->type != HAP_SSS )
+ {
+ // check for truncating stops
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i]=='*' ) break;
+ if ( i!=hap->tref.l )
+ {
+ hap->tref.l = i+1;
+ hap->tref.s[i+1] = 0;
+ }
+ for (i=0; i<hap->tseq.l; i++)
+ if ( hap->tseq.s[i]=='*' ) break;
+ if ( i!=hap->tseq.l )
+ {
+ hap->tseq.l = i+1;
+ hap->tseq.s[i+1] = 0;
+ hap->upstream_stop = 1;
+ }
+ if ( csq->type.type & CSQ_STOP_LOST )
+ {
+ if ( hap->tref.s[hap->tref.l-1]=='*' && hap->tref.s[hap->tref.l-1] == hap->tseq.s[hap->tseq.l-1] )
+ {
+ rm_csq |= CSQ_STOP_LOST;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else if ( hap->tref.s[hap->tref.l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( hap->tseq.s[hap->tseq.l-1] == '*' )
+ {
+ rm_csq |= CSQ_STOP_GAINED;
+ csq->type.type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq->type.type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq->type.type & CSQ_START_LOST && hap->tref.s[0]!='M' )
+ {
+ rm_csq |= CSQ_START_LOST;
+ csq->type.type &= ~CSQ_START_LOST;
+ }
+ if ( dlen!=0 )
+ {
+ if ( dlen%3 )
+ csq->type.type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( dlen<0 )
+ csq->type.type |= CSQ_INFRAME_DELETION;
+ else
+ csq->type.type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (i=0; i<hap->tref.l; i++)
+ if ( hap->tref.s[i] != hap->tseq.s[i] ) break;
+ if ( i==hap->tref.l )
+ csq->type.type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( hap->tref.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_LOST;
+ else if ( hap->tseq.s[i] == '*' )
+ csq->type.type |= CSQ_STOP_GAINED;
+ else
+ csq->type.type |= CSQ_MISSENSE_VARIANT;
+ }
+ }
+ if ( has_upstream_stop ) csq->type.type |= CSQ_UPSTREAM_STOP;
+ csq->type.type &= ~rm_csq;
+
+ if ( hap->stack[ibeg].node->type == HAP_SSS )
+ {
+ node->csq_list[icsq].type.type |= hap->stack[ibeg].node->csq & ~rm_csq;
+ node->csq_list[icsq].type.ref = hap->stack[ibeg].node->rec;
+ node->csq_list[icsq].type.biotype = tr->type;
+ csq_push(args, node->csq_list+icsq, hap->stack[ibeg].node->rec);
+ return;
+ }
+
+ kstring_t str = node->csq_list[icsq].type.vstr;
+ str.l = 0;
+
+ // create the aa variant string
+ int aa_rbeg = tr->strand==STRAND_FWD ? node2rbeg(ibeg)/3+1 : (hap->tr->nsref - 2*N_REF_PAD - node2rend(iend))/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node2sbeg(ibeg)/3+1 : (tlen - node2send(iend))/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(hap->tref.s, &str);
+ if ( !(csq->type.type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(hap->tseq.s, &str);
+ }
+ kputc_('|', &str);
+
+ // create the dna variant string and, in case of combined variants,
+ // insert silent CSQ_PRINTED_UPSTREAM variants
+ for (i=ibeg; i<=iend; i++)
+ {
+ if ( i>ibeg ) kputc_('+', &str);
+ kputw(node2rpos(i)+1, &str);
+ kputs(hap->stack[i].node->var, &str);
+ }
+ node->csq_list[icsq].type.vstr = str;
+ csq_push(args, node->csq_list+icsq, hap->stack[ref_node].node->rec);
+
+ for (i=ibeg; i<=iend; i++)
+ {
+ // csq are printed at one position only for combined variants, the rest is
+ // silent and references the first
+ if ( hap->stack[i].node->csq & ~CSQ_COMPOUND )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = hap->stack[i].node->csq & ~CSQ_COMPOUND & ~rm_csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.vstr.l = 0;
+ kputs(str.s,&tmp_csq->type.vstr);
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ if ( i!=ref_node && (node->csq_list[icsq].type.type & CSQ_COMPOUND || !(hap->stack[i].node->csq & ~CSQ_COMPOUND)) )
+ {
+ node->ncsq_list++;
+ hts_expand0(csq_t,node->ncsq_list,node->mcsq_list,node->csq_list);
+ csq_t *tmp_csq = &node->csq_list[node->ncsq_list - 1];
+ tmp_csq->pos = hap->stack[i].node->rec->pos;
+ tmp_csq->type.trid = tr->id;
+ tmp_csq->type.gene = tr->gene->name;
+ tmp_csq->type.strand = tr->strand;
+ tmp_csq->type.type = CSQ_PRINTED_UPSTREAM | hap->stack[i].node->csq;
+ tmp_csq->type.biotype = tr->type;
+ tmp_csq->type.ref = hap->stack[ref_node].node->rec;
+ tmp_csq->type.vstr.l = 0;
+ csq_push(args, tmp_csq, hap->stack[i].node->rec);
+ }
+ }
+}
+
+void hap_finalize(args_t *args, hap_t *hap)
+{
+ tscript_t *tr = hap->tr;
+ if ( !tr->sref )
+ tscript_splice_ref(tr);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ int istack = 0;
+ hts_expand(hstack_t,1,hap->mstack,hap->stack);
+
+ hap->sseq.l = 0;
+ hap->tseq.l = 0;
+ hap->stack[0].node = tr->root;
+ hap->stack[0].ichild = -1;
+ hap->stack[0].slen = 0;
+ hap->stack[0].dlen = 0;
+
+ while ( istack>=0 )
+ {
+ hstack_t *stack = &hap->stack[istack];
+ hap_node_t *node = hap->stack[istack].node;
+ while ( ++hap->stack[istack].ichild < node->nchild )
+ {
+ if ( node->child[stack->ichild] ) break;
+ }
+ if ( stack->ichild == node->nchild ) { istack--; continue; }
+
+ node = node->child[stack->ichild];
+
+ istack++;
+ hts_expand(hstack_t,istack+1,hap->mstack,hap->stack);
+ stack = &hap->stack[istack-1];
+
+ hap->stack[istack].node = node;
+ hap->stack[istack].ichild = -1;
+
+ hap->sseq.l = stack->slen;
+ if ( node->type==HAP_CDS ) kputs(node->seq, &hap->sseq);
+ hap->stack[istack].slen = hap->sseq.l;
+ hap->stack[istack].dlen = hap->stack[istack-1].dlen + node->dlen;
+
+ if ( !node->nend ) continue; // not a leaf node
+
+ // The spliced sequence has been built for the current haplotype and stored
+ // in hap->sseq. Now we break it and output as independent parts
+
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen; // total length of the spliced query transcript
+ hap->upstream_stop = 0;
+
+ int i = 1, dlen = 0, ibeg, indel = 0;
+ while ( i<istack && hap->stack[i].node->type == HAP_SSS ) i++;
+ hap->sbeg = hap->stack[i].node->sbeg;
+
+ if ( tr->strand==STRAND_FWD )
+ {
+ i = 0, ibeg = -1;
+ while ( ++i <= istack )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ // start/stop/splice site overlap: don't know how to build the haplotypes correctly, skipping
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i<istack )
+ {
+ if ( dlen%3 ) // frameshift
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = node2sbeg(i);
+ int inext = node2sbeg(i+1);
+ if ( icur/3 == inext/3 ) // in the same codon, can't be flushed yet
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+
+ int ioff = node2soff(ibeg);
+ int icur = node2sbeg(ibeg);
+ int rbeg = node2rbeg(ibeg);
+ int rend = node2rend(i);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[i].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(i) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,0, ibeg,i,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ else
+ {
+ i = istack + 1, ibeg = -1;
+ while ( --i > 0 )
+ {
+ if ( hap->stack[i].node->type == HAP_SSS )
+ {
+ hap_add_csq(args,hap,node,0,i,i,0,0);
+ continue;
+ }
+ dlen += hap->stack[i].node->dlen;
+ if ( hap->stack[i].node->dlen ) indel = 1;
+ if ( i>1 && hap->stack[i-1].node->type != HAP_SSS )
+ {
+ if ( dlen%3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ int icur = sseq.m - 1 - node2sbeg(i);
+ int inext = sseq.m - 1 - node2sbeg(i-1);
+ if ( icur/3 == inext/3 )
+ {
+ if ( ibeg==-1 ) ibeg = i;
+ continue;
+ }
+ }
+ if ( ibeg<0 ) ibeg = i;
+ int ioff = node2soff(i);
+ int icur = node2sbeg(i);
+ int rbeg = node2rbeg(i);
+ int rend = node2rend(ibeg);
+ int fill = dlen%3;
+
+ // alt
+ if ( hap->sseq.l )
+ {
+ sseq.l = hap->stack[ibeg].slen - ioff;
+ sseq.s = hap->sseq.s + ioff;
+ }
+ else // splice site overlap, see #1475227917
+ sseq.l = fill = 0;
+ cds_translate(&sref, &sseq, icur,rbeg,rend, tr->strand, &hap->tseq, fill);
+
+ // ref
+ sseq.l = node2rend(ibeg) - rbeg;
+ sseq.s = sref.s + N_REF_PAD + rbeg;
+ sseq.m = sref.m - 2*N_REF_PAD;
+ cds_translate(&sref, &sseq, rbeg,rbeg,rend, tr->strand, &hap->tref, fill);
+ sseq.m = sref.m - 2*N_REF_PAD + hap->stack[istack].dlen;
+
+ hap_add_csq(args,hap,node,sseq.m, i,ibeg,dlen,indel);
+ ibeg = -1;
+ dlen = 0;
+ indel = 0;
+ }
+ }
+ }
+}
+
+static inline void csq_print_text(args_t *args, csq_t *csq, int ismpl, int ihap)
+{
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+}
+static inline void hap_print_text(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list ) return;
+
+ char *smpl = ismpl >= 0 ? args->hdr->samples[ismpl] : "-";
+ const char *chr = bcf_hdr_id2name(args->hdr,args->rid);
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ if ( csq->type.type & CSQ_PRINTED_UPSTREAM ) continue;
+ assert( csq->type.vstr.l );
+
+ fprintf(args->out,"CSQ\t%s\t", smpl);
+ if ( ihap>0 )
+ fprintf(args->out,"%d", ihap);
+ else
+ fprintf(args->out,"-");
+
+ args->str.l = 0;
+ kput_vcsq(&csq->type, &args->str);
+ fprintf(args->out,"\t%s\t%d\t%s\n",chr,csq->pos+1,args->str.s);
+ }
+}
+
+static inline void hap_stage_vcf(args_t *args, tscript_t *tr, int ismpl, int ihap, hap_node_t *node)
+{
+ if ( !node || !node->ncsq_list || ismpl<0 ) return;
+
+ int i;
+ for (i=0; i<node->ncsq_list; i++)
+ {
+ csq_t *csq = node->csq_list + i;
+ vrec_t *vrec = csq->vrec;
+ int icsq = 2*csq->idx + ihap;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[ismpl*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+}
+
+void hap_flush(args_t *args, uint32_t pos)
+{
+ int i,j;
+ tr_heap_t *heap = args->active_tr;
+
+ while ( heap->ndat && heap->dat[0]->end<=pos )
+ {
+ tscript_t *tr = heap->dat[0];
+ khp_delete(trhp, heap);
+
+ args->hap->tr = tr;
+ if ( tr->root && tr->root->nchild ) // normal, non-localized calling
+ {
+ hap_finalize(args, args->hap);
+
+ if ( args->output_type==FT_TAB_TEXT ) // plain text output, not a vcf
+ {
+ if ( args->phase==PHASE_DROP_GT )
+ hap_print_text(args, tr, -1,0, tr->hap[0]);
+ else
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_print_text(args, tr, args->smpl->idx[i],j+1, tr->hap[i*2+j]);
+ }
+ }
+ }
+ else if ( args->phase!=PHASE_DROP_GT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ for (j=0; j<2; j++)
+ hap_stage_vcf(args, tr, args->smpl->idx[i],j, tr->hap[i*2+j]);
+ }
+ }
+ }
+
+ // mark the transcript for deletion. Cannot delete it immediately because
+ // by-position VCF output will need them when flushed by vcf_buf_push
+ args->nrm_tr++;
+ hts_expand(tscript_t*,args->nrm_tr,args->mrm_tr,args->rm_tr);
+ args->rm_tr[args->nrm_tr-1] = tr;
+ }
+}
+
+#define SWAP(type_t, a, b) { type_t t = a; a = b; b = t; }
+
+void vbuf_push(args_t *args, bcf1_t **rec_ptr)
+{
+ int i;
+
+ assert(rec_ptr);
+ bcf1_t *rec = *rec_ptr;
+
+ // check for duplicate records
+ i = args->vcf_rbuf.n ? rbuf_last(&args->vcf_rbuf) : -1;
+ if ( i<0 || args->vcf_buf[i]->vrec[0]->line->pos!=rec->pos )
+ {
+ // vcf record with a new pos
+ rbuf_expand0(&args->vcf_rbuf, vbuf_t*, args->vcf_rbuf.n+1, args->vcf_buf);
+ i = rbuf_append(&args->vcf_rbuf);
+ if ( !args->vcf_buf[i] ) args->vcf_buf[i] = (vbuf_t*) calloc(1,sizeof(vbuf_t));
+ args->vcf_buf[i]->n = 0;
+ }
+ vbuf_t *vbuf = args->vcf_buf[i];
+ vbuf->n++;
+ hts_expand0(vrec_t*, vbuf->n, vbuf->m, vbuf->vrec);
+ if ( !vbuf->vrec[vbuf->n - 1] )
+ vbuf->vrec[vbuf->n - 1] = (vrec_t*) calloc(1,sizeof(vrec_t));
+
+ vrec_t *vrec = vbuf->vrec[vbuf->n - 1];
+ if ( args->phase!=PHASE_DROP_GT && args->smpl->n )
+ {
+ if ( !vrec->smpl ) vrec->smpl = (uint32_t*) calloc(args->hdr_nsmpl,sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ else memset(vrec->smpl,0,args->hdr_nsmpl*sizeof(*vrec->smpl) * args->nfmt_bcsq);
+ }
+ if ( !vrec->line ) vrec->line = bcf_init1();
+ SWAP(bcf1_t*, (*rec_ptr), vrec->line);
+
+ int ret;
+ khint_t k = kh_put(pos2vbuf, args->pos2vbuf, (int)rec->pos, &ret);
+ kh_val(args->pos2vbuf,k) = vbuf;
+}
+
+void vbuf_flush(args_t *args)
+{
+ if ( args->active_tr->ndat ) return; // cannot output buffered VCF lines (args.vbuf) until all active transcripts are gone
+
+ int i,j;
+ while ( (i=rbuf_shift(&args->vcf_rbuf))>=0 )
+ {
+ vbuf_t *vbuf = args->vcf_buf[i];
+ for (i=0; i<vbuf->n; i++)
+ {
+ vrec_t *vrec = vbuf->vrec[i];
+ if ( !args->out_fh ) // not a VCF output
+ {
+ vrec->nvcsq = 0;
+ continue;
+ }
+ if ( !vrec->nvcsq )
+ {
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ continue;
+ }
+
+ args->str.l = 0;
+ kput_vcsq(&vrec->vcsq[0], &args->str);
+ for (j=1; j<vrec->nvcsq; j++)
+ {
+ kputc_(',', &args->str);
+ kput_vcsq(&vrec->vcsq[j], &args->str);
+ }
+ bcf_update_info_string(args->hdr, vrec->line, args->bcsq_tag, args->str.s);
+ if ( args->hdr_nsmpl )
+ {
+ if ( vrec->nfmt < args->nfmt_bcsq )
+ for (j=1; j<args->hdr_nsmpl; j++) memcpy(vrec->smpl+j*vrec->nfmt, vrec->smpl+j*args->nfmt_bcsq, vrec->nfmt*sizeof(*vrec->smpl));
+ bcf_update_format_int32(args->hdr, vrec->line, args->bcsq_tag, vrec->smpl, args->hdr_nsmpl*vrec->nfmt);
+ }
+ vrec->nvcsq = 0;
+ bcf_write(args->out_fh, args->hdr, vrec->line);
+ }
+ if ( vbuf->n )
+ {
+ khint_t k = kh_get(pos2vbuf, args->pos2vbuf, vbuf->vrec[0]->line->pos);
+ if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
+ }
+ vbuf->n = 0;
+ }
+
+ for (i=0; i<args->nrm_tr; i++)
+ {
+ tscript_t *tr = args->rm_tr[i];
+ if ( tr->root ) hap_destroy(tr->root);
+ tr->root = NULL;
+ free(tr->hap);
+ free(tr->ref);
+ free(tr->sref);
+ }
+ args->nrm_tr = 0;
+ args->ncsq_buf = 0;
+}
+
+void tscript_init_ref(args_t *args, tscript_t *tr, const char *chr)
+{
+ int i, len;
+ int pad_beg = tr->beg >= N_REF_PAD ? N_REF_PAD : tr->beg;
+
+ tr->ref = faidx_fetch_seq(args->fai, chr, tr->beg - pad_beg, tr->end + N_REF_PAD, &len);
+ if ( !tr->ref )
+ error("faidx_fetch_seq failed %s:%d-%d\n", chr,tr->beg+1,tr->end+1);
+
+ int pad_end = len - (tr->end - tr->beg + 1 + pad_beg);
+ if ( pad_beg + pad_end != 2*N_REF_PAD )
+ {
+ char *ref = (char*) malloc(tr->end - tr->beg + 1 + 2*N_REF_PAD);
+ for (i=0; i < N_REF_PAD - pad_beg; i++) ref[i] = 'N';
+ memcpy(ref+i, tr->ref, len);
+ for (i=0; i < N_REF_PAD - pad_end; i++) ref[i+len] = 'N';
+ free(tr->ref);
+ tr->ref = ref;
+ }
+}
+
+static void sanity_check_ref(args_t *args, tscript_t *tr, bcf1_t *rec)
+{
+ char *ref = tr->ref + (rec->pos + N_REF_PAD >= tr->beg ? rec->pos - tr->beg + N_REF_PAD : 0);
+ char *vcf = rec->d.allele[0] + (rec->pos + N_REF_PAD >= tr->beg ? 0 : tr->beg - N_REF_PAD - rec->pos);
+ assert( vcf - rec->d.allele[0] < strlen(rec->d.allele[0]) );
+ while ( *ref && *vcf )
+ {
+ if ( *ref!=*vcf && toupper(*ref)!=toupper(*vcf) )
+ error("Error: the fasta reference does not match the VCF REF allele at %s:%d .. %s\n", bcf_seqname(args->hdr,rec),rec->pos+1,rec->d.allele[0]);
+ ref++;
+ vcf++;
+ }
+}
+
+int test_cds_local(args_t *args, bcf1_t *rec)
+{
+ int i,j, ret = 0;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ // structures to fake the normal test_cds machinery
+ hap_node_t root, node;
+ root.type = HAP_ROOT;
+ kstring_t *tref = &args->hap->tref, *tseq = &args->hap->tseq;
+
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+
+ if ( !tr->ref )
+ {
+ tscript_init_ref(args, tr, chr);
+ tscript_splice_ref(tr);
+ khp_insert(trhp, args->active_tr, &tr); // only to clean the reference afterwards
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ kstring_t sref;
+ sref.s = tr->sref;
+ sref.l = tr->nsref;
+ sref.m = sref.l;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( hap_init(args, &root, &node, cds, rec, i)!=0 ) continue;
+
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+
+ int csq_type = node.csq;
+
+ // code repetition: it would be nice to reuse the code from hap_add_csq, needs have refactoring though
+ if ( node.type == HAP_SSS )
+ {
+ csq.type.type = csq_type;
+ csq_stage(args, &csq, rec);
+ }
+ else
+ {
+ kstring_t sseq;
+ sseq.m = sref.m - 2*N_REF_PAD + node.dlen;
+ sseq.s = node.seq;
+ int alen = sseq.l = strlen(sseq.s);
+ int fill = node.dlen%3 && alen ? 1 : 0; // see #1475227917
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tseq, fill);
+
+ sseq.m = sref.m - 2*N_REF_PAD;
+ sseq.s = sref.s + N_REF_PAD + node.sbeg;
+ sseq.l = node.rlen;
+ cds_translate(&sref, &sseq, node.sbeg,node.sbeg,node.sbeg+node.rlen, tr->strand, tref, fill);
+
+ // check for truncating stops
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j]=='*' ) break;
+ if ( j!=tref->l )
+ {
+ tref->l = j+1;
+ tref->s[j+1] = 0;
+ }
+ for (j=0; j<tseq->l; j++)
+ if ( tseq->s[j]=='*' ) break;
+ if ( j!=tseq->l )
+ {
+ tseq->l = j+1;
+ tseq->s[j+1] = 0;
+ }
+ if ( csq_type & CSQ_STOP_LOST )
+ {
+ if ( tref->s[tref->l-1]=='*' && tref->s[tref->l-1] == tseq->s[tseq->l-1] )
+ {
+ csq_type &= ~CSQ_STOP_LOST;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else if (tref->s[tref->l-1]!='*' )
+ {
+ // This is CDS 3' incomplete ENSG00000173376/synon.vcf, can also be missense
+ // We observe in real data a change to a stop, ENST00000528237/retained-stop-incomplete-cds.vcf
+ if ( tseq->s[tseq->l-1] == '*' )
+ {
+ csq_type &= ~CSQ_STOP_GAINED;
+ csq_type |= CSQ_STOP_RETAINED;
+ }
+ else
+ csq_type |= CSQ_INCOMPLETE_CDS;
+ }
+ }
+ if ( csq_type & CSQ_START_LOST && tref->s[0]!='M' )
+ csq_type &= ~CSQ_START_LOST;
+ if ( node.dlen!=0 )
+ {
+ if ( node.dlen%3 )
+ csq_type |= CSQ_FRAMESHIFT_VARIANT;
+ else if ( node.dlen<0 )
+ csq_type |= CSQ_INFRAME_DELETION;
+ else
+ csq_type |= CSQ_INFRAME_INSERTION;
+ }
+ else
+ {
+ for (j=0; j<tref->l; j++)
+ if ( tref->s[j] != tseq->s[j] ) break;
+ if ( j==tref->l )
+ csq_type |= CSQ_SYNONYMOUS_VARIANT;
+ else if ( tref->s[j] == '*' )
+ csq_type |= CSQ_STOP_LOST;
+ else if ( tseq->s[j] == '*' )
+ csq_type |= CSQ_STOP_GAINED;
+ else
+ csq_type |= CSQ_MISSENSE_VARIANT;
+ }
+ if ( csq_type & CSQ_COMPOUND )
+ {
+ // create the aa variant string
+ kstring_t str = {0,0,0};
+ int aa_rbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD - node.sbeg - node.rlen)/3+1;
+ int aa_sbeg = tr->strand==STRAND_FWD ? node.sbeg/3+1 : (tr->nsref - 2*N_REF_PAD + node.dlen - node.sbeg - alen)/3+1;
+ kputc_('|', &str);
+ kputw(aa_rbeg, &str);
+ kputs(tref->s, &str);
+ if ( !(csq_type & CSQ_SYNONYMOUS_VARIANT) )
+ {
+ kputc_('>', &str);
+ kputw(aa_sbeg, &str);
+ kputs(tseq->s, &str);
+ }
+ kputc_('|', &str);
+ kputw(rec->pos+1, &str);
+ kputs(node.var, &str);
+ csq.type.vstr = str;
+ csq.type.type = csq_type & CSQ_COMPOUND;
+ csq_stage(args, &csq, rec);
+
+ // all this only to clean vstr when vrec is flushed
+ if ( !tr->root )
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->root->ncsq_list++;
+ hts_expand0(csq_t,tr->root->ncsq_list,tr->root->mcsq_list,tr->root->csq_list);
+ csq_t *rm_csq = tr->root->csq_list + tr->root->ncsq_list - 1;
+ rm_csq->type.vstr = str;
+ }
+ if ( csq_type & ~CSQ_COMPOUND )
+ {
+ csq.type.type = csq_type & ~CSQ_COMPOUND;
+ csq.type.vstr.l = 0;
+ csq_stage(args, &csq, rec);
+ }
+ }
+ free(node.seq);
+ free(node.var);
+ }
+ }
+ return ret;
+}
+
+int test_cds(args_t *args, bcf1_t *rec)
+{
+ int i, ret = 0, hap_ret;
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_cds,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_cds_t *cds = regitr_payload(args->itr,gf_cds_t*);
+ tscript_t *tr = cds->tr;
+ if ( !GF_is_coding(tr->type) ) continue;
+ ret = 1;
+ if ( !tr->root )
+ {
+ // initialize the transcript and its haplotype tree, fetch the reference sequence
+ tscript_init_ref(args, tr, chr);
+
+ tr->root = (hap_node_t*) calloc(1,sizeof(hap_node_t));
+ tr->nhap = args->phase==PHASE_DROP_GT ? 1 : 2*args->smpl->n; // maximum ploidy = diploid
+ tr->hap = (hap_node_t**) malloc(tr->nhap*sizeof(hap_node_t*));
+ for (i=0; i<tr->nhap; i++) tr->hap[i] = NULL;
+ tr->root->nend = tr->nhap;
+ tr->root->type = HAP_ROOT;
+
+ khp_insert(trhp, args->active_tr, &tr);
+ }
+
+ sanity_check_ref(args, tr, rec);
+
+ if ( args->phase==PHASE_DROP_GT )
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ hap_node_t *parent = tr->hap[0] ? tr->hap[0] : tr->root;
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, 1))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ }
+ else ret = 1; // prevent reporting as intron in test_tscript
+ free(child);
+ continue;
+ }
+ parent->nend--;
+ parent->nchild = 1;
+ parent->mchild = 1;
+ parent->child = (hap_node_t**) malloc(sizeof(hap_node_t*));
+ parent->child[0] = child;
+ tr->hap[0] = child;
+ tr->hap[0]->nend = 1;
+ continue;
+ }
+
+ // apply the VCF variants and extend the haplotype tree
+ int j, ismpl, ihap, ngts = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ ngts /= bcf_hdr_nsamples(args->hdr);
+ if ( ngts!=1 && ngts!=2 )
+ {
+ if ( !args->quiet )
+ fprintf(pysam_stderr,"Warning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping site with non-diploid/non-haploid genotypes at %s:%d\t%s>%s\n", chr,rec->pos+1,rec->d.allele[0],rec->d.allele[1]);
+ continue;
+ }
+ for (ismpl=0; ismpl<args->smpl->n; ismpl++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[ismpl]*ngts;
+ if ( gt[0]==bcf_gt_missing ) continue;
+
+ if ( ngts>1 && gt[0]!=gt[1] && gt[1]!=bcf_gt_missing && gt[1]!=bcf_int32_vector_end )
+ {
+ if ( args->phase==PHASE_MERGE )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ }
+ if ( !bcf_gt_is_phased(gt[0]) && !bcf_gt_is_phased(gt[1]) )
+ {
+ if ( args->phase==PHASE_REQUIRE )
+ error("Unphased genotype at %s:%d, sample %s. See the --phase option.\n", chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]]);
+ if ( args->phase==PHASE_SKIP )
+ continue;
+ if ( args->phase==PHASE_NON_REF )
+ {
+ if ( !bcf_gt_allele(gt[0]) ) gt[0] = gt[1];
+ else if ( !bcf_gt_allele(gt[1]) ) gt[1] = gt[0];
+ }
+ }
+ }
+
+ for (ihap=0; ihap<ngts; ihap++)
+ {
+ if ( gt[ihap]==bcf_gt_missing || gt[ihap]==bcf_int32_vector_end ) continue;
+
+ i = 2*ismpl + ihap;
+
+ int ial = bcf_gt_allele(gt[ihap]);
+ if ( !ial ) continue;
+ assert( ial < rec->n_allele );
+ if ( rec->d.allele[ial][0]=='<' || rec->d.allele[ial][0]=='*' ) { continue; }
+
+ hap_node_t *parent = tr->hap[i] ? tr->hap[i] : tr->root;
+ if ( parent->cur_rec==rec && parent->cur_child[ial]>=0 )
+ {
+ // this haplotype has been seen in another sample
+ tr->hap[i] = parent->child[ parent->cur_child[ial] ];
+ tr->hap[i]->nend++;
+ parent->nend--;
+ continue;
+ }
+
+ hap_node_t *child = (hap_node_t*)calloc(1,sizeof(hap_node_t));
+ if ( (hap_ret=hap_init(args, parent, child, cds, rec, ial))!=0 )
+ {
+ // overlapping or intron variant, cannot apply
+ if ( hap_ret==1 )
+ {
+ if ( !args->quiet )
+ fprintf(pysam_stderr,"Warning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ if ( args->out )
+ fprintf(args->out,"LOG\tWarning: Skipping overlapping variants at %s:%d, sample %s\t%s>%s\n",
+ chr,rec->pos+1,args->hdr->samples[args->smpl->idx[ismpl]],rec->d.allele[0],rec->d.allele[ial]);
+ }
+ free(child);
+ continue;
+ }
+
+ if ( parent->cur_rec!=rec )
+ {
+ hts_expand(int,rec->n_allele,parent->mcur_child,parent->cur_child);
+ for (j=0; j<rec->n_allele; j++) parent->cur_child[j] = -1;
+ parent->cur_rec = rec;
+ }
+
+ j = parent->nchild++;
+ hts_expand0(hap_node_t*,parent->nchild,parent->mchild,parent->child);
+ parent->cur_child[ial] = j;
+ parent->child[j] = child;
+ tr->hap[i] = child;
+ tr->hap[i]->nend++;
+ parent->nend--;
+ }
+ }
+ }
+ return ret;
+}
+
+void csq_stage(args_t *args, csq_t *csq, bcf1_t *rec)
+{
+ // known issues: tab output leads to unsorted output. This is because
+ // coding haplotypes are printed in one go and buffering is not used
+ // with tab output. VCF output is OK though.
+ if ( csq_push(args, csq, rec)!=0 ) return; // the consequence already exists
+
+ int i,j,ngt = 0;
+ if ( args->phase!=PHASE_DROP_GT )
+ {
+ ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
+ if ( ngt>0 ) ngt /= bcf_hdr_nsamples(args->hdr);
+ }
+ if ( ngt<=0 )
+ {
+ if ( args->output_type==FT_TAB_TEXT )
+ csq_print_text(args, csq, -1,0);
+ return;
+ }
+ assert( ngt<=2 );
+
+ if ( args->output_type==FT_TAB_TEXT )
+ {
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+ csq_print_text(args, csq, args->smpl->idx[i],j+1);
+ }
+ }
+ return;
+ }
+
+ vrec_t *vrec = csq->vrec;
+ for (i=0; i<args->smpl->n; i++)
+ {
+ int32_t *gt = args->gt_arr + args->smpl->idx[i]*ngt;
+ for (j=0; j<ngt; j++)
+ {
+ if ( gt[j]==bcf_gt_missing || gt[j]==bcf_int32_vector_end || !bcf_gt_allele(gt[j]) ) continue;
+
+ int icsq = 2*csq->idx + j;
+ if ( icsq >= args->ncsq_max ) // more than ncsq_max consequences, so can't fit it in FMT
+ {
+ int ismpl = args->smpl->idx[i];
+ int print_warning = 1;
+ if ( args->quiet )
+ {
+ if ( args->quiet > 1 || args->ncsq_small_warned ) print_warning = 0;
+ args->ncsq_small_warned = 1;
+ }
+ if ( print_warning )
+ {
+ fprintf(pysam_stderr,"Warning: --ncsq %d is too small to annotate %s at %s:%d with %d-th csq\n",
+ args->ncsq_max/2,args->hdr->samples[ismpl],bcf_hdr_id2name(args->hdr,args->rid),vrec->line->pos+1,csq->idx+1);
+ if ( args->quiet ) fprintf(pysam_stderr,"(This warning is printed only once)\n");
+ }
+ break;
+ }
+ if ( vrec->nfmt < 1 + icsq/32 ) vrec->nfmt = 1 + icsq/32;
+ vrec->smpl[i*args->nfmt_bcsq + icsq/32] |= 1 << (icsq % 32);
+ }
+ }
+}
+int test_utr(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ // note that the off-by-one extension of rlen is deliberate to account for insertions
+ if ( !regidx_overlap(args->idx_utr,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_utr_t *utr = regitr_payload(args->itr, gf_utr_t*);
+ tscript_t *tr = splice.tr = utr->tr;
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = utr->which==prime5 ? CSQ_UTR5 : CSQ_UTR3;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+int test_splice(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_exon,chr,rec->pos,rec->pos + rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+ splice.check_acceptor = splice.check_donor = 1;
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ gf_exon_t *exon = regitr_payload(args->itr, gf_exon_t*);
+ splice.tr = exon->tr;
+ if ( !splice.tr->ncds ) continue; // not a coding transcript, no interest in splice sites
+
+ splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
+ splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
+
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ splice_csq(args, &splice, exon->beg, exon->end);
+ if ( splice.csq ) ret = 1;
+ }
+ }
+ free(splice.kref.s);
+ free(splice.kalt.s);
+ return ret;
+}
+int test_tscript(args_t *args, bcf1_t *rec)
+{
+ const char *chr = bcf_seqname(args->hdr,rec);
+ if ( !regidx_overlap(args->idx_tscript,chr,rec->pos,rec->pos+rec->rlen, args->itr) ) return 0;
+
+ splice_t splice;
+ splice_init(&splice, rec);
+
+ int i, ret = 0;
+ while ( regitr_overlap(args->itr) )
+ {
+ tscript_t *tr = splice.tr = regitr_payload(args->itr, tscript_t*);
+ for (i=1; i<rec->n_allele; i++)
+ {
+ if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
+ splice.vcf.alt = rec->d.allele[i];
+ int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
+ if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
+ csq_t csq;
+ memset(&csq, 0, sizeof(csq_t));
+ csq.pos = rec->pos;
+ csq.type.type = GF_is_coding(tr->type) ? CSQ_INTRON : CSQ_NON_CODING;
+ csq.type.biotype = tr->type;
+ csq.type.strand = tr->strand;
+ csq.type.trid = tr->id;
+ csq.type.gene = tr->gene->name;
+ csq_stage(args, &csq, rec);
+ ret = 1;
+ }
+ }
+ assert(!splice.kref.s);
+ assert(!splice.kalt.s);
+ return ret;
+}
+
+void process(args_t *args, bcf1_t **rec_ptr)
+{
+ if ( !rec_ptr )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ return;
+ }
+
+ bcf1_t *rec = *rec_ptr;
+
+ int call_csq = 1;
+ if ( !rec->n_allele ) call_csq = 0; // no alternate allele
+ else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*') ) call_csq = 0; // gVCF, no alt allele
+ else if ( args->filter )
+ {
+ call_csq = filter_test(args->filter, rec, NULL);
+ if ( args->filter_logic==FLT_EXCLUDE ) call_csq = call_csq ? 0 : 1;
+ }
+ if ( !call_csq )
+ {
+ if ( !args->out_fh ) return; // not a VCF output
+ vbuf_push(args, rec_ptr);
+ vbuf_flush(args);
+ return;
+ }
+
+ if ( args->rid != rec->rid )
+ {
+ hap_flush(args, REGIDX_MAX);
+ vbuf_flush(args);
+ }
+ args->rid = rec->rid;
+ vbuf_push(args, rec_ptr);
+
+ int hit = args->local_csq ? test_cds_local(args, rec) : test_cds(args, rec);
+ hit += test_utr(args, rec);
+ hit += test_splice(args, rec);
+ if ( !hit ) test_tscript(args, rec);
+
+ hap_flush(args, rec->pos-1);
+ vbuf_flush(args);
+
+ return;
+}
+
+const char *usage(void)
+{
+ return
+ "\n"
+ "About: Haplotype-aware consequence caller.\n"
+ "Usage: bcftools csq [options] in.vcf\n"
+ "\n"
+ "Required options:\n"
+ " -f, --fasta-ref <file> reference file in fasta format\n"
+ " -g, --gff-annot <file> gff3 annotation file\n"
+ "\n"
+ "CSQ options:\n"
+ " -c, --custom-tag <string> use this tag instead of the default BCSQ\n"
+ " -l, --local-csq localized predictions, consider only one VCF record at a time\n"
+ " -n, --ncsq <int> maximum number of consequences to consider per site [16]\n"
+ " -p, --phase <a|m|r|R|s> how to construct haplotypes and how to deal with unphased data: [r]\n"
+ " a: take GTs as is, create haplotypes regardless of phase (0/1 -> 0|1)\n"
+ " m: merge *all* GTs into a single haplotype (0/1 -> 1, 1/2 -> 1)\n"
+ " r: require phased GTs, throw an error on unphased het GTs\n"
+ " R: create non-reference haplotypes if possible (0/1 -> 1|1, 1/2 -> 1|2)\n"
+ " s: skip unphased GTs\n"
+ "Options:\n"
+ " -e, --exclude <expr> exclude sites for which the expression is true\n"
+ " -i, --include <expr> select sites for which the expression is true\n"
+ " -o, --output <file> write output to a file [standard output]\n"
+ " -O, --output-type <b|u|z|v|t> b: compressed BCF, u: uncompressed BCF, z: compressed VCF\n"
+ " v: uncompressed VCF, t: plain tab-delimited text output [v]\n"
+ " -q, --quiet suppress warning messages. Can be given two times for even less messages\n"
+ " -r, --regions <region> restrict to comma-separated list of regions\n"
+ " -R, --regions-file <file> restrict to regions listed in a file\n"
+ " -s, --samples <-|list> samples to include or \"-\" to apply all variants and ignore samples\n"
+ " -S, --samples-file <file> samples to include\n"
+ " -t, --targets <region> similar to -r but streams rather than index-jumps\n"
+ " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n"
+ "\n"
+ "Example:\n"
+ " bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
+ "\n"
+ " # GFF3 annotation files can be downloaded from Ensembl. e.g. for human:\n"
+ " ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/\n"
+ " ftp://ftp.ensembl.org/pub/grch37/release-84/gff3/homo_sapiens/\n"
+ "\n";
+}
+
+int main_csq(int argc, char *argv[])
+{
+ args_t *args = (args_t*) calloc(1,sizeof(args_t));
+ args->argc = argc; args->argv = argv;
+ args->output_type = FT_VCF;
+ args->bcsq_tag = "BCSQ";
+ args->ncsq_max = 2*16;
+
+ static struct option loptions[] =
+ {
+ {"help",0,0,'h'},
+ {"ncsq",1,0,'n'},
+ {"custom-tag",1,0,'c'},
+ {"local-csq",0,0,'l'},
+ {"gff-annot",1,0,'g'},
+ {"fasta-ref",1,0,'f'},
+ {"include",1,0,'i'},
+ {"exclude",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,NULL,'O'},
+ {"phase",1,0,'p'},
+ {"quiet",0,0,'q'},
+ {"regions",1,0,'r'},
+ {"regions-file",1,0,'R'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
+ {"targets",1,0,'t'},
+ {"targets-file",1,0,'T'},
+ {0,0,0,0}
+ };
+ int c, targets_is_file = 0, regions_is_file = 0;
+ char *targets_list = NULL, *regions_list = NULL;
+ while ((c = getopt_long(argc, argv, "?hr:R:t:T:i:e:f:o:O:g:s:S:p:qc:ln:",loptions,NULL)) >= 0)
+ {
+ switch (c)
+ {
+ case 'l': args->local_csq = 1; break;
+ case 'c': args->bcsq_tag = optarg; break;
+ case 'q': args->quiet++; break;
+ case 'p':
+ switch (optarg[0])
+ {
+ case 'a': args->phase = PHASE_AS_IS; break;
+ case 'm': args->phase = PHASE_MERGE; break;
+ case 'r': args->phase = PHASE_REQUIRE; break;
+ case 'R': args->phase = PHASE_NON_REF; break;
+ case 's': args->phase = PHASE_SKIP; break;
+ default: error("The -p code \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'f': args->fa_fname = optarg; break;
+ case 'g': args->gff_fname = optarg; break;
+ case 'n':
+ args->ncsq_max = 2 * atoi(optarg);
+ if ( args->ncsq_max <=0 ) error("Expected positive integer with -n, got %s\n", optarg);
+ break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ switch (optarg[0]) {
+ case 't': args->output_type = FT_TAB_TEXT; break;
+ case 'b': args->output_type = FT_BCF_GZ; break;
+ case 'u': args->output_type = FT_BCF; break;
+ case 'z': args->output_type = FT_VCF_GZ; break;
+ case 'v': args->output_type = FT_VCF; break;
+ default: error("The output type \"%s\" not recognised\n", optarg);
+ }
+ break;
+ case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
+ case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 'r': regions_list = optarg; break;
+ case 'R': regions_list = optarg; regions_is_file = 1; break;
+ case 's': args->sample_list = optarg; break;
+ case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
+ case 't': targets_list = optarg; break;
+ case 'T': targets_list = optarg; targets_is_file = 1; break;
+ case 'h':
+ case '?': error("%s",usage());
+ default: error("The option not recognised: %s\n\n", optarg); break;
+ }
+ }
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else error("%s", usage());
+ }
+ else fname = argv[optind];
+ if ( argc - optind>1 ) error("%s", usage());
+ if ( !args->fa_fname ) error("Missing the --fa-ref option\n");
+ if ( !args->gff_fname ) error("Missing the --gff option\n");
+ args->sr = bcf_sr_init();
+ if ( targets_list && bcf_sr_set_targets(args->sr, targets_list, targets_is_file, 0)<0 )
+ error("Failed to read the targets: %s\n", targets_list);
+ if ( regions_list && bcf_sr_set_regions(args->sr, regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", regions_list);
+ if ( !bcf_sr_add_reader(args->sr, fname) )
+ error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->sr->errnum));
+ args->hdr = bcf_sr_get_header(args->sr,0);
+
+ init_data(args);
+ while ( bcf_sr_next_line(args->sr) )
+ {
+ process(args, &args->sr->readers[0].buffer[0]);
+ }
+ process(args,NULL);
+
+ destroy_data(args);
+ bcf_sr_destroy(args->sr);
+ free(args);
+
+ return 0;
+}
+
#include <ctype.h>
#include <stdlib.h>
+#include <strings.h>
#include <errno.h>
#include <math.h>
#include <wordexp.h>
#include <htslib/hts_defs.h>
#include <htslib/vcfutils.h>
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+uint64_t bcf_double_missing = 0x7ff0000000000001;
+uint64_t bcf_double_vector_end = 0x7ff0000000000002;
+static inline void bcf_double_set(double *ptr, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.i = value;
+ *ptr = u.d;
+}
+static inline int bcf_double_test(double d, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.d = d;
+ return u.i==value ? 1 : 0;
+}
+#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
+#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing)
+#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end)
+#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
+
+
typedef struct _token_t
{
// read-only values, same for all VCF lines
int tok_type; // one of the TOK_* keys below
char *key; // set only for string constants, otherwise NULL
char *tag; // for debugging and printout only, VCF tag name
- float threshold; // filtering threshold
+ double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
regex_t *regex; // precompiled regex for string comparison
// modified on filter evaluation at each VCF line
- float *values; // In case str_value is set, values[0] is one sample's string length
+ double *values; // In case str_value is set, values[0] is one sample's string length
char *str_value; // and values[0]*nsamples gives the total length;
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
int nfilters;
token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
int32_t *tmpi;
- int max_unpack, mtmpi, nsamples;
+ float *tmpf;
+ int max_unpack, mtmpi, mtmpf, nsamples;
};
tok->nvalues = 0;
else
{
- tok->values[0] = line->qual;
+ tok->values[0] = (double)line->qual;
tok->nvalues = 1;
}
}
static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->values[0] = bcf_get_variant_types(line);
+ if ( !tok->values[0] ) tok->values[0] = 1; // mistake in htslib: VCF_* should start with 1
+ else tok->values[0] = ((int)tok->values[0]) << 1;
tok->nvalues = 1;
}
static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->str_value = NULL;
}
}
+static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
+ int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
+ if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
+ return a&b ? 0 : 1;
+}
static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
{
int i;
}
/**
- * bcf_get_info_value() - get single INFO value, int or float
+ * bcf_get_info_value() - get single INFO value, int64_t or double
* @line: BCF line
* @info_id: tag ID, as returned by bcf_hdr_id2int
* @ivec: 0-based index to retrieve, -1 when single value is expected
bcf_info_t *info = &line->d.info[j];
if ( info->len == 1 )
{
- if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f;
- else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i;
+ if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f;
+ else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i;
return 1;
}
return 1; \
}
switch (info->type) {
- case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break;
- case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break;
- case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break;
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int64_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
default: fprintf(stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
{
if ( tok->idx==-2 )
{
- int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
- tok->nvalues = n;
- hts_expand(float,n,tok->mvalues,tok->values);
- for (i=0; i<n; i++) tok->values[i] = flt->tmpi[i];
+ int i;
+ tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+ }
}
else
{
- int32_t value;
+ int64_t value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
{
if ( tok->idx==-2 )
{
- tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues);
- if ( tok->nvalues<0 ) tok->nvalues = 0;
+ int i;
+ tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
+ if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
+ else tok->values[i] = flt->tmpf[i];
+ }
}
else
{
- float value;
+ double value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
else
{
int is_missing = 1;
- hts_expand(float,tok->nvalues,tok->mvalues,tok->values);
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
for (i=0; i<tok->nvalues; i++)
{
if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
- bcf_float_set_missing(tok->values[i]);
+ bcf_double_set_missing(tok->values[i]);
else
{
tok->values[i] = flt->tmpi[i];
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 )
+ int i;
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+ {
tok->nvalues = tok->nsamples = 0; // missing values
- else if ( tok->idx >= 0 )
+ }
+ else
{
- int i, nsmpl, nvals;
- nsmpl = bcf_hdr_nsamples(flt->hdr);
- nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nsamples = tok->nvalues = 0; // the index is too big
- else
+ int is_missing = 1;
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
{
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nsamples = tok->nvalues = nsmpl;
+ if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+ bcf_double_set_missing(tok->values[i]);
+ else
+ {
+ tok->values[i] = flt->tmpf[i];
+ is_missing = 0;
+ }
+ }
+ if ( is_missing ) tok->nvalues = 0;
+ else if ( tok->idx >= 0 )
+ {
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nvalues = nsmpl;
+ }
}
}
tok->nsamples = tok->nvalues;
tok->nvalues = tok->nsamples = 0;
return;
}
- int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
kstring_t str;
gt_length_too_big:
{
int plen = str.l;
- #define BRANCH(type_t) { \
- type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
- if ( !(ptr[0]>>1) ) kputc('.',&str); \
- }
- switch (fmt->type) {
- case BCF_BT_INT8: BRANCH(int8_t); break;
- case BCF_BT_INT16: BRANCH(int16_t); break;
- case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
- }
- #undef BRANCH
-
- if ( plen==str.l )
+ bcf_format_gt(fmt, i, &str);
+ kputc_(0,&str);
+ if ( str.l - plen > blen )
{
- bcf_format_gt(fmt, i, &str);
- if ( str.l - plen > blen )
- {
- // too many alternate alleles or ploidy is too large, the genotype does not fit
- // three characters ("0/0" vs "10/10").
- tok->str_value = str.s;
- blen *= 2;
- goto gt_length_too_big;
- }
+ // too many alternate alleles or ploidy is too large, the genotype does not fit
+ // three characters ("0/0" vs "10/10").
+ tok->str_value = str.s;
+ blen *= 2;
+ goto gt_length_too_big;
}
plen = str.l - plen;
}
else
{
- hts_expand(float,line->n_allele,tok->mvalues,tok->values);
+ hts_expand(double,line->n_allele,tok->mvalues,tok->values);
for (i=1; i<line->n_allele; i++)
tok->values[i-1] = flt->tmpi[i];
tok->nvalues = line->n_allele - 1;
if ( !tok->nvalues ) return;
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
}
static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
{
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
}
}
static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = -HUGE_VAL;
+ double val = -HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
{
- if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
}
tok->values[0] = val;
tok->nvalues = 1;
}
static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = HUGE_VAL;
+ double val = HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = n ? val / n : 0;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) continue; \
- if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \
+ if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
has_values = 1; \
(atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
} \
} \
else if ( (btok)->nsamples ) \
{ \
- hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
+ hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
for (i=0; i<(btok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \
+ if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \
+ if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
for (i=0; i<btok->nsamples; i++)
atok->pass_samples[i] = btok->pass_samples[i];
atok->nsamples = btok->nsamples;
+ atok->nvalues = 1;
return btok->pass_site;
}
if ( !btok->nvalues ) // missing value in b
+ {
+ btok->nvalues = 1;
return atok->pass_site;
+ }
if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
if ( !atok->nsamples )
if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
token_t *tok = (atok)->is_missing ? (btok) : (atok); \
(ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
+ tok->nvalues = 1; \
}
#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
{ \
for (i=0; i<(atok)->nsamples; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
has_values = 1; \
if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
else (atok)->pass_samples[i] = 0; \
} \
else if ( (atok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(atok)->nsamples; i++) \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
+ /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (btok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(btok)->nsamples; i++) \
{ \
- for (i=0; i<(btok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
}
return pass_site;
}
-static int regex_vector_strings(token_t *atok, token_t *btok)
+static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
{
- int ret = regexec(btok->regex, atok->str_value, 0,NULL,0);
- return ret==0 ? 1 : 0;
+ int i, pass_site = 0;
+ if ( atok->nsamples )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ char *ptr = atok->str_value + i*(int)atok->values[0];
+ atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
+ if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ return pass_site;
+ }
+ pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+ if ( negate ) pass_site = pass_site ? 0 : 1;
+ return pass_site;
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
int quote = str[0];
if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
tok->key = (char*) calloc(len-1,sizeof(char));
- hts_expand(float,1,tok->mvalues,tok->values);
+ hts_expand(double,1,tok->mvalues,tok->values);
tok->values[0] = len-2;
memcpy(tok->key,str+1,len-2);
tok->key[len-2] = 0;
return 0;
}
- // is it a value?
+ // is it a value? Here we parse as integer/float separately and use strtof
+ // rather than strtod, because the more accurate double representation
+ // would invalidate floating point comparisons like QUAL=59.2, obtained via
+ // htslib/vcf parser
char *end;
- errno = 0;
- tok->threshold = strtod(tmp.s, &end);
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ tok->threshold = strtol(tmp.s, &end, 10); // integer?
+ if ( end - tmp.s != strlen(tmp.s) )
+ {
+ errno = 0;
+ tok->threshold = strtof(tmp.s, &end); // float?
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ }
if ( tmp.s ) free(tmp.s);
return 0;
// Look for j="." and k numeric type
int j = i-1, k = i-2;
if ( !out[j].is_str ) { k = i-1, j = i-2; }
- if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) )
+ if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
{
int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
- if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
- if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
}
if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
if ( !out[j].key )
error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
out[j].regex = (regex_t *) malloc(sizeof(regex_t));
- if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) )
+ int cflags = REG_NOSUB;
+ int len = strlen(out[j].key);
+ if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\' )
+ {
+ out[j].key[len-2] = 0;
+ cflags |= REG_ICASE;
+ }
+ if ( regcomp(out[j].regex, out[j].key, cflags) )
error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
}
if ( out[i].tok_type!=TOK_VAL ) continue;
if ( !strcmp(out[i].tag,"TYPE") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; }
- else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
- out[j].tag = out[j].key; out[j].key = NULL;
- i = j;
+ int itok, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1;
+ else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
+ else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
+ if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ i = itok;
continue;
}
if ( !strcmp(out[i].tag,"FILTER") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value"
- if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and !=
- if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE;
- if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ;
- if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key )
+ int itok = i, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
+ else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( strcmp(".",out[j].key) )
+ if ( strcmp(".",out[ival].key) )
{
- out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
- if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
- error("The filter \"%s\" not present in the VCF header\n", out[j].key);
+ out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
+ error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
}
else
- out[j].hdr_id = -1;
- out[j].tag = out[j].key; out[j].key = NULL;
- out[i].hdr_id = out[j].hdr_id;
- i = j;
+ out[ival].hdr_id = -1;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ out[itok].hdr_id = out[ival].hdr_id;
continue;
}
}
else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
- hts_expand0(float,1,out[i].mvalues,out[i].values);
+ hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
free(filter->flt_stack);
free(filter->str);
free(filter->tmpi);
+ free(filter->tmpf);
free(filter);
}
}
int is_true = 0;
- if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
+ if ( filter->filters[i].comparator )
+ is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
+ else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
{
int skip = 0;
if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
{
if ( is_str==2 )
- {
- is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]);
- if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1;
- }
+ is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
else
error("The regex operator can be used on strings only: %s\n", filter->str);
}
#include <ctype.h>
#include <stdlib.h>
+#include <strings.h>
#include <errno.h>
#include <math.h>
#include <wordexp.h>
#include <htslib/hts_defs.h>
#include <htslib/vcfutils.h>
+#ifndef __FUNCTION__
+# define __FUNCTION__ __func__
+#endif
+
+uint64_t bcf_double_missing = 0x7ff0000000000001;
+uint64_t bcf_double_vector_end = 0x7ff0000000000002;
+static inline void bcf_double_set(double *ptr, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.i = value;
+ *ptr = u.d;
+}
+static inline int bcf_double_test(double d, uint64_t value)
+{
+ union { uint64_t i; double d; } u;
+ u.d = d;
+ return u.i==value ? 1 : 0;
+}
+#define bcf_double_set_vector_end(x) bcf_double_set(&(x),bcf_double_vector_end)
+#define bcf_double_set_missing(x) bcf_double_set(&(x),bcf_double_missing)
+#define bcf_double_is_vector_end(x) bcf_double_test((x),bcf_double_vector_end)
+#define bcf_double_is_missing(x) bcf_double_test((x),bcf_double_missing)
+
+
typedef struct _token_t
{
// read-only values, same for all VCF lines
int tok_type; // one of the TOK_* keys below
char *key; // set only for string constants, otherwise NULL
char *tag; // for debugging and printout only, VCF tag name
- float threshold; // filtering threshold
+ double threshold; // filtering threshold
int hdr_id, type; // BCF header lookup ID and one of BCF_HT_* types
int idx; // 0-based index to VCF vectors, -1: not a vector, -2: any field ([*])
void (*setter)(filter_t *, bcf1_t *, struct _token_t *);
regex_t *regex; // precompiled regex for string comparison
// modified on filter evaluation at each VCF line
- float *values; // In case str_value is set, values[0] is one sample's string length
+ double *values; // In case str_value is set, values[0] is one sample's string length
char *str_value; // and values[0]*nsamples gives the total length;
int is_str, is_missing; // is_missing is set only for constants, variables are controled via nvalues
int pass_site; // -1 not applicable, 0 fails, >0 pass
int nfilters;
token_t *filters, **flt_stack; // filtering input tokens (in RPN) and evaluation stack
int32_t *tmpi;
- int max_unpack, mtmpi, nsamples;
+ float *tmpf;
+ int max_unpack, mtmpi, mtmpf, nsamples;
};
tok->nvalues = 0;
else
{
- tok->values[0] = line->qual;
+ tok->values[0] = (double)line->qual;
tok->nvalues = 1;
}
}
static void filters_set_type(filter_t *flt, bcf1_t *line, token_t *tok)
{
tok->values[0] = bcf_get_variant_types(line);
+ if ( !tok->values[0] ) tok->values[0] = 1; // mistake in htslib: VCF_* should start with 1
+ else tok->values[0] = ((int)tok->values[0]) << 1;
tok->nvalues = 1;
}
static void filters_set_info(filter_t *flt, bcf1_t *line, token_t *tok)
tok->str_value = NULL;
}
}
+static int filters_cmp_bit_and(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
+{
+ int a = (int)(atok->nvalues?atok->values[0]:atok->threshold);
+ int b = (int)(btok->nvalues?btok->values[0]:btok->threshold);
+ if ( op_type==TOK_LIKE ) return a&b ? 1 : 0;
+ return a&b ? 0 : 1;
+}
static int filters_cmp_filter(token_t *atok, token_t *btok, int op_type, bcf1_t *line)
{
int i;
}
/**
- * bcf_get_info_value() - get single INFO value, int or float
+ * bcf_get_info_value() - get single INFO value, int64_t or double
* @line: BCF line
* @info_id: tag ID, as returned by bcf_hdr_id2int
* @ivec: 0-based index to retrieve, -1 when single value is expected
bcf_info_t *info = &line->d.info[j];
if ( info->len == 1 )
{
- if ( info->type==BCF_BT_FLOAT ) *((float*)value) = info->v1.f;
- else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int*)value) = info->v1.i;
+ if ( info->type==BCF_BT_FLOAT ) *((double*)value) = info->v1.f;
+ else if ( info->type==BCF_BT_INT8 || info->type==BCF_BT_INT16 || info->type==BCF_BT_INT32 ) *((int64_t*)value) = info->v1.i;
return 1;
}
return 1; \
}
switch (info->type) {
- case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int); break;
- case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int); break;
- case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int); break;
- case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), float); break;
+ case BCF_BT_INT8: BRANCH(int8_t, p[j]==bcf_int8_missing, p[j]==bcf_int8_vector_end, int64_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t, p[j]==bcf_int16_missing, p[j]==bcf_int16_vector_end, int64_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t, p[j]==bcf_int32_missing, p[j]==bcf_int32_vector_end, int64_t); break;
+ case BCF_BT_FLOAT: BRANCH(float, bcf_float_is_missing(p[j]), bcf_float_is_vector_end(p[j]), double); break;
default: fprintf(pysam_stderr,"todo: type %d\n", info->type); exit(1); break;
}
#undef BRANCH
{
if ( tok->idx==-2 )
{
- int i, n = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
- tok->nvalues = n;
- hts_expand(float,n,tok->mvalues,tok->values);
- for (i=0; i<n; i++) tok->values[i] = flt->tmpi[i];
+ int i;
+ tok->nvalues = bcf_get_info_int32(flt->hdr,line,tok->tag,&flt->tmpi,&flt->mtmpi);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++) tok->values[i] = flt->tmpi[i];
+ }
}
else
{
- int32_t value;
+ int64_t value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
{
if ( tok->idx==-2 )
{
- tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues);
- if ( tok->nvalues<0 ) tok->nvalues = 0;
+ int i;
+ tok->nvalues = bcf_get_info_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf);
+ if ( tok->nvalues<=0 ) tok->nvalues = 0;
+ else
+ {
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
+ if ( bcf_float_is_missing(flt->tmpf[i]) ) bcf_double_set_missing(tok->values[i]);
+ else tok->values[i] = flt->tmpf[i];
+ }
}
else
{
- float value;
+ double value;
if ( bcf_get_info_value(line,tok->hdr_id,tok->idx,&value) <= 0 )
tok->nvalues = 0;
else
else
{
int is_missing = 1;
- hts_expand(float,tok->nvalues,tok->mvalues,tok->values);
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
for (i=0; i<tok->nvalues; i++)
{
if ( flt->tmpi[i]==bcf_int32_missing || flt->tmpi[i]==bcf_int32_vector_end )
- bcf_float_set_missing(tok->values[i]);
+ bcf_double_set_missing(tok->values[i]);
else
{
tok->values[i] = flt->tmpi[i];
}
static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
{
- if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&tok->values,&tok->mvalues))<=0 )
+ int i;
+ if ( (tok->nvalues=bcf_get_format_float(flt->hdr,line,tok->tag,&flt->tmpf,&flt->mtmpf))<=0 )
+ {
tok->nvalues = tok->nsamples = 0; // missing values
- else if ( tok->idx >= 0 )
+ }
+ else
{
- int i, nsmpl, nvals;
- nsmpl = bcf_hdr_nsamples(flt->hdr);
- nvals = tok->nvalues / nsmpl;
- if ( tok->idx >= nvals )
- tok->nsamples = tok->nvalues = 0; // the index is too big
- else
+ int is_missing = 1;
+ hts_expand(double,tok->nvalues,tok->mvalues,tok->values);
+ for (i=0; i<tok->nvalues; i++)
{
- for (i=0; i<nsmpl; i++)
- tok->values[i] = tok->values[i*nvals+tok->idx];
- tok->nsamples = tok->nvalues = nsmpl;
+ if ( bcf_float_is_missing(flt->tmpf[i]) || bcf_float_is_vector_end(flt->tmpf[i]) )
+ bcf_double_set_missing(tok->values[i]);
+ else
+ {
+ tok->values[i] = flt->tmpf[i];
+ is_missing = 0;
+ }
+ }
+ if ( is_missing ) tok->nvalues = 0;
+ else if ( tok->idx >= 0 )
+ {
+ int nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int nvals = tok->nvalues / nsmpl;
+ if ( tok->idx >= nvals )
+ tok->nvalues = 0; // the index is too big
+ else
+ {
+ for (i=0; i<nsmpl; i++)
+ tok->values[i] = tok->values[i*nvals+tok->idx];
+ tok->nvalues = nsmpl;
+ }
}
}
tok->nsamples = tok->nvalues;
tok->nvalues = tok->nsamples = 0;
return;
}
- int i, blen = 3, nsmpl = bcf_hdr_nsamples(flt->hdr);
+ int i, blen = 4, nsmpl = bcf_hdr_nsamples(flt->hdr);
kstring_t str;
gt_length_too_big:
{
int plen = str.l;
- #define BRANCH(type_t) { \
- type_t *ptr = (type_t*) (fmt->p + i*fmt->size); \
- if ( !(ptr[0]>>1) ) kputc('.',&str); \
- }
- switch (fmt->type) {
- case BCF_BT_INT8: BRANCH(int8_t); break;
- case BCF_BT_INT16: BRANCH(int16_t); break;
- case BCF_BT_INT32: BRANCH(int32_t); break;
- default: fprintf(pysam_stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
- }
- #undef BRANCH
-
- if ( plen==str.l )
+ bcf_format_gt(fmt, i, &str);
+ kputc_(0,&str);
+ if ( str.l - plen > blen )
{
- bcf_format_gt(fmt, i, &str);
- if ( str.l - plen > blen )
- {
- // too many alternate alleles or ploidy is too large, the genotype does not fit
- // three characters ("0/0" vs "10/10").
- tok->str_value = str.s;
- blen *= 2;
- goto gt_length_too_big;
- }
+ // too many alternate alleles or ploidy is too large, the genotype does not fit
+ // three characters ("0/0" vs "10/10").
+ tok->str_value = str.s;
+ blen *= 2;
+ goto gt_length_too_big;
}
plen = str.l - plen;
}
else
{
- hts_expand(float,line->n_allele,tok->mvalues,tok->values);
+ hts_expand(double,line->n_allele,tok->mvalues,tok->values);
for (i=1; i<line->n_allele; i++)
tok->values[i-1] = flt->tmpi[i];
tok->nvalues = line->n_allele - 1;
if ( !tok->nvalues ) return;
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
}
static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
{
int i, an = flt->tmpi[0];
for (i=0; i<tok->nvalues; i++)
{
- tok->values[i] /= (float)an;
+ tok->values[i] /= (double)an;
if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
}
}
static void set_max(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = -HUGE_VAL;
+ double val = -HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
{
- if ( !bcf_float_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val < tok->values[i] ) val = tok->values[i];
}
tok->values[0] = val;
tok->nvalues = 1;
}
static void set_min(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = HUGE_VAL;
+ double val = HUGE_VAL;
int i;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
+ if ( !bcf_double_is_missing(tok->values[i]) && val > tok->values[i] ) val = tok->values[i];
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_avg(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = n ? val / n : 0;
tok->nvalues = 1;
tok->nsamples = 0;
}
static void set_sum(filter_t *flt, bcf1_t *line, token_t *tok)
{
- float val = 0;
+ double val = 0;
int i, n = 0;
for (i=0; i<tok->nvalues; i++)
- if ( !bcf_float_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
+ if ( !bcf_double_is_missing(tok->values[i]) ) { val += tok->values[i]; n++; }
tok->values[0] = val;
tok->nvalues = 1;
tok->nsamples = 0;
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) continue; \
- if ( bcf_float_is_missing((btok)->values[i]) ) { bcf_float_set_missing((atok)->values[i]); continue; } \
+ if ( bcf_double_is_missing((atok)->values[i]) ) continue; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { bcf_double_set_missing((atok)->values[i]); continue; } \
has_values = 1; \
(atok)->values[i] = (atok)->values[i] AOP (btok)->values[i]; \
} \
} \
else if ( (btok)->nsamples ) \
{ \
- hts_expand(float,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
+ hts_expand(double,(btok)->nvalues,(atok)->mvalues,(atok)->values); \
for (i=0; i<(btok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) || bcf_float_is_missing((btok)->values[i]) ) \
+ if ( bcf_double_is_missing((atok)->values[0]) || bcf_double_is_missing((btok)->values[i]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
{ \
for (i=0; i<(atok)->nvalues; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) || bcf_float_is_missing((btok)->values[0]) ) \
+ if ( bcf_double_is_missing((atok)->values[i]) || bcf_double_is_missing((btok)->values[0]) ) \
{ \
- bcf_float_set_missing((atok)->values[i]); \
+ bcf_double_set_missing((atok)->values[i]); \
continue; \
} \
has_values = 1; \
for (i=0; i<btok->nsamples; i++)
atok->pass_samples[i] = btok->pass_samples[i];
atok->nsamples = btok->nsamples;
+ atok->nvalues = 1;
return btok->pass_site;
}
if ( !btok->nvalues ) // missing value in b
+ {
+ btok->nvalues = 1;
return atok->pass_site;
+ }
if ( !atok->nsamples && !btok->nsamples ) return atok->pass_site || btok->pass_site;
if ( !atok->nsamples )
if ( (atok)->nsamples || (btok)->nsamples ) error("todo: Querying of missing values in FORMAT\n"); \
token_t *tok = (atok)->is_missing ? (btok) : (atok); \
(ret) = ( tok->nvalues CMP_OP 1 ) ? 0 : 1; \
+ tok->nvalues = 1; \
}
#define CMP_VECTORS(atok,btok,CMP_OP,ret) \
{ \
for (i=0; i<(atok)->nsamples; i++) \
{ \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
has_values = 1; \
if ( (atok)->values[i] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
else (atok)->pass_samples[i] = 0; \
} \
else if ( (atok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((btok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(atok)->nsamples; i++) \
{ \
- for (i=0; i<(atok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
+ /*if ( bcf_double_is_missing((atok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; }*/ \
+ has_values = 1; \
+ if ( (atok)->values[i] CMP_OP (btok)->values[0] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (btok)->nsamples ) \
{ \
- if ( bcf_float_is_missing((atok)->values[0]) ) { (atok)->nvalues = 0; (atok)->nsamples = 0; (ret) = 0; } \
- else \
+ for (i=0; i<(btok)->nsamples; i++) \
{ \
- for (i=0; i<(btok)->nsamples; i++) \
- { \
- if ( bcf_float_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
- has_values = 1; \
- if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
- else (atok)->pass_samples[i] = 0; \
- } \
- (atok)->nvalues = (btok)->nvalues; \
- (atok)->nsamples = (btok)->nsamples; \
+ if ( bcf_double_is_missing((btok)->values[i]) ) { (atok)->pass_samples[i] = 0; continue; } \
+ has_values = 1; \
+ if ( (atok)->values[0] CMP_OP (btok)->values[i] ) { (atok)->pass_samples[i] = 1; pass_site = 1; } \
+ else (atok)->pass_samples[i] = 0; \
} \
+ (atok)->nvalues = (btok)->nvalues; \
+ (atok)->nsamples = (btok)->nsamples; \
if ( !has_values ) (atok)->nvalues = 0; \
} \
else if ( (atok)->idx==-2 || (btok)->idx==-2 ) \
}
return pass_site;
}
-static int regex_vector_strings(token_t *atok, token_t *btok)
+static int regex_vector_strings(token_t *atok, token_t *btok, int negate)
{
- int ret = regexec(btok->regex, atok->str_value, 0,NULL,0);
- return ret==0 ? 1 : 0;
+ int i, pass_site = 0;
+ if ( atok->nsamples )
+ {
+ for (i=0; i<atok->nsamples; i++)
+ {
+ char *ptr = atok->str_value + i*(int)atok->values[0];
+ atok->pass_samples[i] = regexec(btok->regex, ptr, 0,NULL,0) ? 0 : 1;
+ if ( negate ) atok->pass_samples[i] = atok->pass_samples[i] ? 0 : 1;
+ pass_site |= atok->pass_samples[i];
+ }
+ return pass_site;
+ }
+ pass_site = regexec(btok->regex, atok->str_value, 0,NULL,0) ? 0 : 1;
+ if ( negate ) pass_site = pass_site ? 0 : 1;
+ return pass_site;
}
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
int quote = str[0];
if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
tok->key = (char*) calloc(len-1,sizeof(char));
- hts_expand(float,1,tok->mvalues,tok->values);
+ hts_expand(double,1,tok->mvalues,tok->values);
tok->values[0] = len-2;
memcpy(tok->key,str+1,len-2);
tok->key[len-2] = 0;
return 0;
}
- // is it a value?
+ // is it a value? Here we parse as integer/float separately and use strtof
+ // rather than strtod, because the more accurate double representation
+ // would invalidate floating point comparisons like QUAL=59.2, obtained via
+ // htslib/vcf parser
char *end;
- errno = 0;
- tok->threshold = strtod(tmp.s, &end);
- if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ tok->threshold = strtol(tmp.s, &end, 10); // integer?
+ if ( end - tmp.s != strlen(tmp.s) )
+ {
+ errno = 0;
+ tok->threshold = strtof(tmp.s, &end); // float?
+ if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);
+ }
if ( tmp.s ) free(tmp.s);
return 0;
// Look for j="." and k numeric type
int j = i-1, k = i-2;
if ( !out[j].is_str ) { k = i-1, j = i-2; }
- if ( out[k].hdr_id>0 && out[j].is_str && !strcmp(".",out[j].key) )
+ if ( out[k].hdr_id>0 && out[j].is_str && out[j].key && !strcmp(".",out[j].key) )
{
int type = bcf_hdr_id2type(filter->hdr,out[k].type,out[k].hdr_id);
- if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
- if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_float_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_INT ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
+ if ( type==BCF_HT_REAL ) { out[j].is_str = 0; out[j].is_missing = 1; bcf_double_set_missing(out[j].values[0]); }
}
}
if ( out[i].tok_type==TOK_LIKE || out[i].tok_type==TOK_NLIKE )
if ( !out[j].key )
error("Could not parse the expression, wrong value for regex operator: %s\n", filter->str);
out[j].regex = (regex_t *) malloc(sizeof(regex_t));
- if ( regcomp(out[j].regex, out[j].key, REG_ICASE|REG_NOSUB) )
+ int cflags = REG_NOSUB;
+ int len = strlen(out[j].key);
+ if ( len>2 && out[j].key[len-1]=='i' && out[j].key[len-2]=='/' && out[j].key[len-3]!='\\' )
+ {
+ out[j].key[len-2] = 0;
+ cflags |= REG_ICASE;
+ }
+ if ( regcomp(out[j].regex, out[j].key, cflags) )
error("Could not compile the regex expression \"%s\": %s\n", out[j].key,filter->str);
}
if ( out[i].tok_type!=TOK_VAL ) continue;
if ( !strcmp(out[i].tag,"TYPE") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) { out[j].threshold = VCF_SNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) { out[j].threshold = VCF_INDEL; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) { out[j].threshold = VCF_MNP; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"other") ) { out[j].threshold = VCF_OTHER; out[j].is_str = 0; }
- else if ( !strcasecmp(out[j].key,"ref") ) { out[j].threshold = VCF_REF; out[j].is_str = 0; }
- else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
- out[j].tag = out[j].key; out[j].key = NULL;
- i = j;
+ int itok, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+1].tok_type==TOK_LIKE || out[i+1].tok_type==TOK_NLIKE ) ival = i - 1, itok = i + 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) itok = i + 2, ival = i + 1;
+ else if ( out[i+2].tok_type==TOK_LIKE || out[i+2].tok_type==TOK_NLIKE ) itok = i + 2, ival = i + 1;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( !strcasecmp(out[ival].key,"snp") || !strcasecmp(out[ival].key,"snps") ) { out[ival].threshold = VCF_SNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"indel") || !strcasecmp(out[ival].key,"indels") ) { out[ival].threshold = VCF_INDEL<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"mnp") || !strcasecmp(out[ival].key,"mnps") ) { out[ival].threshold = VCF_MNP<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"other") ) { out[ival].threshold = VCF_OTHER<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"bnd") ) { out[ival].threshold = VCF_BND<<1; out[ival].is_str = 0; }
+ else if ( !strcasecmp(out[ival].key,"ref") ) { out[ival].threshold = 1; out[ival].is_str = 0; }
+ else error("The type \"%s\" not recognised: %s\n", out[ival].key, filter->str);
+ if ( out[itok].tok_type==TOK_LIKE || out[itok].tok_type==TOK_NLIKE ) out[itok].comparator = filters_cmp_bit_and;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ i = itok;
continue;
}
if ( !strcmp(out[i].tag,"FILTER") )
{
if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
- int j = i+1;
- if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; // the expression has "value"=FILTER rather than FILTER="value"
- if ( out[j].tok_type==TOK_LIKE ) out[j].tok_type = TOK_EQ; // for FILTER, ~ and !~ work the same way as = and !=
- if ( out[j].tok_type==TOK_NLIKE ) out[j].tok_type = TOK_NE;
- if ( out[j+1].tok_type==TOK_LIKE ) out[j+1].tok_type = TOK_EQ;
- if ( out[j+1].tok_type==TOK_NLIKE ) out[j+1].tok_type = TOK_NE;
- if ( out[j].tok_type!=TOK_VAL || !out[j].key )
+ int itok = i, ival;
+ if ( out[i+1].tok_type==TOK_EQ || out[i+1].tok_type==TOK_NE ) ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_LIKE ) out[i+1].tok_type = TOK_EQ, ival = i - 1;
+ else if ( out[i+1].tok_type==TOK_NLIKE ) out[i+1].tok_type = TOK_NE, ival = i - 1;
+ else if ( out[i+2].tok_type==TOK_EQ || out[i+2].tok_type==TOK_NE ) ival = ++i;
+ else if ( out[i+2].tok_type==TOK_LIKE ) out[i+2].tok_type = TOK_EQ, ival = ++i;
+ else if ( out[i+2].tok_type==TOK_NLIKE ) out[i+2].tok_type = TOK_NE, ival = ++i;
+ else error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
+ if ( out[ival].tok_type!=TOK_VAL || !out[ival].key )
error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
- if ( strcmp(".",out[j].key) )
+ if ( strcmp(".",out[ival].key) )
{
- out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
- if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
- error("The filter \"%s\" not present in the VCF header\n", out[j].key);
+ out[ival].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[ival].key);
+ if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[ival].hdr_id) )
+ error("The filter \"%s\" not present in the VCF header\n", out[ival].key);
}
else
- out[j].hdr_id = -1;
- out[j].tag = out[j].key; out[j].key = NULL;
- out[i].hdr_id = out[j].hdr_id;
- i = j;
+ out[ival].hdr_id = -1;
+ out[ival].tag = out[ival].key; out[ival].key = NULL;
+ out[itok].hdr_id = out[ival].hdr_id;
continue;
}
}
else if ( out[i].tok_type==TOK_SUM ) { out[i].setter = set_sum; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_ABS ) { out[i].setter = set_abs; out[i].tok_type = TOK_FUNC; }
else if ( out[i].tok_type==TOK_LEN ) { out[i].setter = set_strlen; out[i].tok_type = TOK_FUNC; }
- hts_expand0(float,1,out[i].mvalues,out[i].values);
+ hts_expand0(double,1,out[i].mvalues,out[i].values);
if ( filter->nsamples )
{
out[i].pass_samples = (uint8_t*)malloc(filter->nsamples);
free(filter->flt_stack);
free(filter->str);
free(filter->tmpi);
+ free(filter->tmpf);
free(filter);
}
}
int is_true = 0;
- if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
+ if ( filter->filters[i].comparator )
+ is_true = filter->filters[i].comparator(filter->flt_stack[nstack-1],filter->flt_stack[nstack-2],filter->filters[i].tok_type,line);
+ else if ( !filter->flt_stack[nstack-1]->nvalues || !filter->flt_stack[nstack-2]->nvalues )
{
int skip = 0;
if ( !filter->flt_stack[nstack-2]->is_missing && !filter->flt_stack[nstack-1]->is_missing ) skip = 1;
else if ( filter->filters[i].tok_type == TOK_LIKE || filter->filters[i].tok_type == TOK_NLIKE )
{
if ( is_str==2 )
- {
- is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1]);
- if ( filter->filters[i].tok_type == TOK_NLIKE ) is_true = is_true ? 0 : 1;
- }
+ is_true = regex_vector_strings(filter->flt_stack[nstack-2],filter->flt_stack[nstack-1], filter->filters[i].tok_type == TOK_LIKE ? 0 : 1);
else
error("The regex operator can be used on strings only: %s\n", filter->str);
}
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <stdlib.h>
+#include "bcftools.h"
+#include "hclust.h"
+
+typedef struct _node_t
+{
+ struct _node_t *akid, *bkid, *next, *prev, *parent;
+ int id, idx; // id: unique node id; idx: current index to pdist
+ float value; // max pairwise dist of elements within the node
+}
+node_t;
+
+struct _hclust_t
+{
+ int ndat, nclust; // ndat: number of elements (pdist matrix size); nclust: current number of clusters
+ float *pdist; // pairwise cluster distances, diagonal matrix accessed via the PDIST macro
+ node_t *first, *last; // clusters are maintained in a double-linked list
+ node_t **rmme; // convenience array to remove all allocated nodes at the end
+ int nrmme;
+ kstring_t str; // (for debugging) pointer to str.s is returned by create_dot()
+ char **dbg; // (for debugging) created by create_list() via set_threshold() and returned by explain()
+ int ndbg, mdbg;
+};
+
+node_t *append_node(hclust_t *clust, int idx)
+{
+ node_t *node = (node_t*) calloc(1,sizeof(node_t));
+
+ clust->nclust++;
+ node->id = clust->nrmme;
+ node->idx = idx;
+ if ( !clust->first )
+ {
+ clust->first = node;
+ clust->last = node;
+ }
+ else
+ {
+ node->prev = clust->last;
+ clust->last->next = node;
+ clust->last = node;
+ }
+
+ if ( clust->nrmme >= clust->ndat*2 ) error("hclust fixme: %d vs %d\n",clust->nrmme,clust->ndat);
+ clust->rmme[clust->nrmme++] = node;
+
+ return node;
+}
+void remove_node(hclust_t *clust, node_t *node)
+{
+ if ( node==clust->first ) clust->first = node->next;
+ if ( node==clust->last ) clust->last = node->prev;
+ if ( node->next ) node->next->prev = node->prev;
+ if ( node->prev ) node->prev->next = node->next;
+ clust->nclust--;
+}
+
+#if DEBUG
+void hclust_debug(hclust_t *clust)
+{
+ int i;
+ fprintf(stderr,"nrmme=%d nclust=%d\n", clust->nrmme,clust->nclust);
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ int akid = node->akid ? node->akid->id : -1;
+ int bkid = node->bkid ? node->bkid->id : -1;
+ int akidx = node->akid ? node->akid->idx : -1;
+ int bkidx = node->bkid ? node->bkid->idx : -1;
+ fprintf(stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx);
+ }
+
+ int j;
+ for (i=1; i<clust->ndat; i++)
+ {
+ int active = 0;
+ node_t *node = clust->first;
+ while (node)
+ {
+ if ( node->idx==i ) { active=1; break; }
+ node = node->next;
+ }
+ fprintf(stderr,"%2d%c ",i,active?'*':' ');
+ for (j=0; j<i; j++)
+ {
+ if ( PDIST(clust->pdist,i,j)==9 )
+ fprintf(stderr," ----- ");
+ else
+ fprintf(stderr," %f", PDIST(clust->pdist,i,j));
+ }
+ fprintf(stderr,"\n");
+ }
+ for (j=0; j<clust->ndat-1; j++) fprintf(stderr," %6d ",j); fprintf(stderr,"\n");
+}
+#endif
+
+hclust_t *hclust_init(int n, float *pdist)
+{
+ hclust_t *clust = (hclust_t*) calloc(1,sizeof(hclust_t));
+ clust->ndat = n;
+ clust->pdist = pdist;
+ clust->rmme = (node_t**) calloc(n*2,sizeof(node_t*));
+
+ // init clusters
+ int i;
+ for (i=0; i<clust->ndat; i++) append_node(clust,i);
+
+ // build the tree
+ while ( clust->nclust>1 )
+ {
+ // find two clusters with minimum distance
+ float min_value = HUGE_VAL;
+ node_t *iclust = clust->first->next;
+ node_t *min_iclust = NULL, *min_jclust = NULL;
+ while ( iclust )
+ {
+ node_t *jclust = clust->first;
+ while ( jclust!=iclust )
+ {
+ float value = PDIST(clust->pdist,iclust->idx,jclust->idx);
+ if ( value < min_value )
+ {
+ min_value = value;
+ min_iclust = iclust;
+ min_jclust = jclust;
+ }
+ jclust = jclust->next;
+ }
+ iclust = iclust->next;
+ }
+ assert( min_iclust && min_jclust ); // pdist contains inf or nan, fix the caller
+ remove_node(clust,min_iclust);
+ remove_node(clust,min_jclust);
+
+ // update the pairwise distances. We keep the matrix and as we are moving up the
+ // tree, we use fewer columns/rows as the number of clusters decreases: we reuse
+ // i-th and leave j-th unused. Inter-cluster distance is defined as maximum distance
+ // between pairwise distances of elements within the cluster.
+ iclust = clust->first;
+ while ( iclust )
+ {
+ if ( PDIST(clust->pdist,iclust->idx,min_iclust->idx) < PDIST(clust->pdist,iclust->idx,min_jclust->idx) )
+ PDIST(clust->pdist,iclust->idx,min_iclust->idx) = PDIST(clust->pdist,iclust->idx,min_jclust->idx);
+ iclust = iclust->next;
+ }
+
+ node_t *node = append_node(clust,min_iclust->idx);
+ node->akid = min_iclust;
+ node->bkid = min_jclust;
+ node->value = min_value;
+ node->akid->parent = node;
+ node->bkid->parent = node;
+ }
+
+ return clust;
+}
+void hclust_destroy(hclust_t *clust)
+{
+ int i;
+ for (i=0; i<clust->nrmme; i++) free(clust->rmme[i]);
+ free(clust->rmme);
+ free(clust->dbg);
+ free(clust->str.s);
+ free(clust);
+}
+
+char *hclust_create_dot(hclust_t *clust, char **labels, float th)
+{
+ clust->str.l = 0;
+ ksprintf(&clust->str,"digraph myGraph {");
+
+ int i;
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->value )
+ ksprintf(&clust->str,"\"%d\" [label=\"%f\"];", node->id,node->value);
+ else
+ ksprintf(&clust->str,"\"%d\" [label=\"%s\"];", node->id,labels[node->idx]);
+ }
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->akid )
+ {
+ if ( node->value >= th && node->akid && node->akid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->akid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->akid->id);
+ }
+
+ if ( node->bkid )
+ {
+ if ( node->value >= th && node->bkid && node->bkid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->bkid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->bkid->id);
+ }
+ }
+ ksprintf(&clust->str,"};");
+ return clust->str.s;
+}
+char **hclust_explain(hclust_t *clust, int *nlines)
+{
+ clust->ndbg = 0;
+ char *beg = clust->str.s;
+ while ( *beg )
+ {
+ char *end = beg;
+ while ( *end && *end!='\n' ) end++;
+ clust->ndbg++;
+ hts_expand(char*,clust->ndbg,clust->mdbg,clust->dbg);
+ clust->dbg[clust->ndbg-1] = beg;
+ if ( !*end ) break;
+ *end = 0;
+ beg = end + 1;
+ }
+
+ *nlines = clust->ndbg;
+ return clust->dbg;
+}
+
+cluster_t *append_cluster(node_t *node, cluster_t *cluster, int *nclust, node_t **stack)
+{
+ (*nclust)++;
+ cluster = (cluster_t*) realloc(cluster,sizeof(cluster_t)*(*nclust));
+ cluster_t *clust = &cluster[*nclust-1];
+ clust->nmemb = 0;
+ clust->memb = NULL;
+ clust->dist = node->value;
+
+ int nstack = 1;
+ stack[0] = node;
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( node->akid )
+ {
+ stack[nstack++] = akid;
+ stack[nstack++] = bkid;
+ }
+ else
+ {
+ clust->nmemb++;
+ clust->memb = (int*) realloc(clust->memb,sizeof(int)*clust->nmemb);
+ clust->memb[clust->nmemb-1] = node->id;
+ }
+ }
+ return cluster;
+}
+
+int cmp_nodes(const void *a, const void *b)
+{
+ const node_t *an = *((const node_t**) a);
+ const node_t *bn = *((const node_t**) b);
+ if ( an->value < bn->value ) return -1;
+ if ( an->value > bn->value ) return 1;
+ return 0;
+}
+
+float calc_dev(node_t **dat, int n)
+{
+ float avg = 0, dev = 0;
+ int i;
+ for (i=0; i<n; i++) avg += dat[i]->value;
+ avg /= n;
+ for (i=0; i<n; i++) dev += (dat[i]->value - avg)*(dat[i]->value - avg);
+ return sqrt(dev/n);
+}
+
+/*
+ Heuristics to determine clustering cutoff: sort nodes by distance and
+ split into two groups by minimizing the standard deviation.
+ This works best when two elements from a single different sample are
+ included in the mix.
+ - min_inter_dist .. smaller values are always considered identical
+ - max_intra_dist .. larger values are always considered different
+ */
+float hclust_set_threshold(hclust_t *clust, float min_inter_dist, float max_intra_dist)
+{
+ node_t **dat = clust->rmme + clust->ndat;
+ int i, ndat = clust->nrmme - clust->ndat;
+
+ qsort(dat, ndat, sizeof(dat), cmp_nodes);
+
+ clust->str.l = 0;
+ float th, min_dev = HUGE_VAL;
+ int imin = -1;
+ for (i=0; i<ndat; i++)
+ {
+ float dev = 0;
+ if ( i>0 ) dev += calc_dev(dat,i);
+ if ( i+1<ndat ) dev += calc_dev(dat+i,ndat-i);
+ th = dat[i]->value;
+ ksprintf(&clust->str,"DEV\t%f\t%f\n",th,dev);
+ if ( min_dev > dev && th >= min_inter_dist ) { min_dev = dev; imin = i; }
+ }
+ if ( max_intra_dist > 0 )
+ th = max_intra_dist; // use fixed cutoff, the above was only for debugging output
+ else
+ {
+ // dynamic cutoff
+ max_intra_dist = fabs(max_intra_dist);
+ th = imin==-1 ? max_intra_dist : dat[imin]->value;
+ if ( th > max_intra_dist ) th = max_intra_dist;
+ }
+ ksprintf(&clust->str,"TH\t%f\n", th);
+ ksprintf(&clust->str,"MAX_DIST\t%f\n", dat[ndat-1]->value);
+ ksprintf(&clust->str,"MIN_INTER\t%f\n", min_inter_dist);
+ ksprintf(&clust->str,"MAX_INTRA\t%f\n", max_intra_dist);
+ return th;
+}
+
+cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust)
+{
+ float cutoff = *max_intra_dist = hclust_set_threshold(clust, min_inter_dist, *max_intra_dist);
+
+ node_t **stack = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ node_t **tmp = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ stack[0] = clust->first;
+ int nstack = 1;
+
+ cluster_t *cluster = NULL;
+ int ncluster = 0;
+
+ if ( stack[0]->value < cutoff )
+ {
+ // all values are within the limits - create a single cluster
+ cluster = append_cluster(stack[0], cluster, &ncluster, tmp);
+ nstack = 0;
+ }
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( !akid )
+ {
+ cluster = append_cluster(node, cluster, &ncluster, tmp);
+ continue;
+ }
+
+ if ( node->value >= cutoff && akid->value < cutoff )
+ cluster = append_cluster(akid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = akid;
+
+ if ( node->value >= cutoff && bkid->value < cutoff )
+ cluster = append_cluster(bkid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = bkid;
+ }
+
+ free(tmp);
+ free(stack);
+
+ *nclust = ncluster;
+ return cluster;
+}
+
+void hclust_destroy_list(cluster_t *clust, int nclust)
+{
+ int i;
+ for (i=0; i<nclust; i++) free(clust[i].memb);
+ free(clust);
+}
+
+
--- /dev/null
+#include "pysam.h"
+
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <stdlib.h>
+#include "bcftools.h"
+#include "hclust.h"
+
+typedef struct _node_t
+{
+ struct _node_t *akid, *bkid, *next, *prev, *parent;
+ int id, idx; // id: unique node id; idx: current index to pdist
+ float value; // max pairwise dist of elements within the node
+}
+node_t;
+
+struct _hclust_t
+{
+ int ndat, nclust; // ndat: number of elements (pdist matrix size); nclust: current number of clusters
+ float *pdist; // pairwise cluster distances, diagonal matrix accessed via the PDIST macro
+ node_t *first, *last; // clusters are maintained in a double-linked list
+ node_t **rmme; // convenience array to remove all allocated nodes at the end
+ int nrmme;
+ kstring_t str; // (for debugging) pointer to str.s is returned by create_dot()
+ char **dbg; // (for debugging) created by create_list() via set_threshold() and returned by explain()
+ int ndbg, mdbg;
+};
+
+node_t *append_node(hclust_t *clust, int idx)
+{
+ node_t *node = (node_t*) calloc(1,sizeof(node_t));
+
+ clust->nclust++;
+ node->id = clust->nrmme;
+ node->idx = idx;
+ if ( !clust->first )
+ {
+ clust->first = node;
+ clust->last = node;
+ }
+ else
+ {
+ node->prev = clust->last;
+ clust->last->next = node;
+ clust->last = node;
+ }
+
+ if ( clust->nrmme >= clust->ndat*2 ) error("hclust fixme: %d vs %d\n",clust->nrmme,clust->ndat);
+ clust->rmme[clust->nrmme++] = node;
+
+ return node;
+}
+void remove_node(hclust_t *clust, node_t *node)
+{
+ if ( node==clust->first ) clust->first = node->next;
+ if ( node==clust->last ) clust->last = node->prev;
+ if ( node->next ) node->next->prev = node->prev;
+ if ( node->prev ) node->prev->next = node->next;
+ clust->nclust--;
+}
+
+#if DEBUG
+void hclust_debug(hclust_t *clust)
+{
+ int i;
+ fprintf(pysam_stderr,"nrmme=%d nclust=%d\n", clust->nrmme,clust->nclust);
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ int akid = node->akid ? node->akid->id : -1;
+ int bkid = node->bkid ? node->bkid->id : -1;
+ int akidx = node->akid ? node->akid->idx : -1;
+ int bkidx = node->bkid ? node->bkid->idx : -1;
+ fprintf(pysam_stderr,"\t%d\t%d\t%f\t%d %d\t%d %d\n",node->id,node->idx,node->value,akid,bkid,akidx,bkidx);
+ }
+
+ int j;
+ for (i=1; i<clust->ndat; i++)
+ {
+ int active = 0;
+ node_t *node = clust->first;
+ while (node)
+ {
+ if ( node->idx==i ) { active=1; break; }
+ node = node->next;
+ }
+ fprintf(pysam_stderr,"%2d%c ",i,active?'*':' ');
+ for (j=0; j<i; j++)
+ {
+ if ( PDIST(clust->pdist,i,j)==9 )
+ fprintf(pysam_stderr," ----- ");
+ else
+ fprintf(pysam_stderr," %f", PDIST(clust->pdist,i,j));
+ }
+ fprintf(pysam_stderr,"\n");
+ }
+ for (j=0; j<clust->ndat-1; j++) fprintf(pysam_stderr," %6d ",j); fprintf(pysam_stderr,"\n");
+}
+#endif
+
+hclust_t *hclust_init(int n, float *pdist)
+{
+ hclust_t *clust = (hclust_t*) calloc(1,sizeof(hclust_t));
+ clust->ndat = n;
+ clust->pdist = pdist;
+ clust->rmme = (node_t**) calloc(n*2,sizeof(node_t*));
+
+ // init clusters
+ int i;
+ for (i=0; i<clust->ndat; i++) append_node(clust,i);
+
+ // build the tree
+ while ( clust->nclust>1 )
+ {
+ // find two clusters with minimum distance
+ float min_value = HUGE_VAL;
+ node_t *iclust = clust->first->next;
+ node_t *min_iclust = NULL, *min_jclust = NULL;
+ while ( iclust )
+ {
+ node_t *jclust = clust->first;
+ while ( jclust!=iclust )
+ {
+ float value = PDIST(clust->pdist,iclust->idx,jclust->idx);
+ if ( value < min_value )
+ {
+ min_value = value;
+ min_iclust = iclust;
+ min_jclust = jclust;
+ }
+ jclust = jclust->next;
+ }
+ iclust = iclust->next;
+ }
+ assert( min_iclust && min_jclust ); // pdist contains inf or nan, fix the caller
+ remove_node(clust,min_iclust);
+ remove_node(clust,min_jclust);
+
+ // update the pairwise distances. We keep the matrix and as we are moving up the
+ // tree, we use fewer columns/rows as the number of clusters decreases: we reuse
+ // i-th and leave j-th unused. Inter-cluster distance is defined as maximum distance
+ // between pairwise distances of elements within the cluster.
+ iclust = clust->first;
+ while ( iclust )
+ {
+ if ( PDIST(clust->pdist,iclust->idx,min_iclust->idx) < PDIST(clust->pdist,iclust->idx,min_jclust->idx) )
+ PDIST(clust->pdist,iclust->idx,min_iclust->idx) = PDIST(clust->pdist,iclust->idx,min_jclust->idx);
+ iclust = iclust->next;
+ }
+
+ node_t *node = append_node(clust,min_iclust->idx);
+ node->akid = min_iclust;
+ node->bkid = min_jclust;
+ node->value = min_value;
+ node->akid->parent = node;
+ node->bkid->parent = node;
+ }
+
+ return clust;
+}
+void hclust_destroy(hclust_t *clust)
+{
+ int i;
+ for (i=0; i<clust->nrmme; i++) free(clust->rmme[i]);
+ free(clust->rmme);
+ free(clust->dbg);
+ free(clust->str.s);
+ free(clust);
+}
+
+char *hclust_create_dot(hclust_t *clust, char **labels, float th)
+{
+ clust->str.l = 0;
+ ksprintf(&clust->str,"digraph myGraph {");
+
+ int i;
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->value )
+ ksprintf(&clust->str,"\"%d\" [label=\"%f\"];", node->id,node->value);
+ else
+ ksprintf(&clust->str,"\"%d\" [label=\"%s\"];", node->id,labels[node->idx]);
+ }
+ for (i=0; i<clust->nrmme; i++)
+ {
+ node_t *node = clust->rmme[i];
+ if ( node->akid )
+ {
+ if ( node->value >= th && node->akid && node->akid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->akid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->akid->id);
+ }
+
+ if ( node->bkid )
+ {
+ if ( node->value >= th && node->bkid && node->bkid->value < th )
+ ksprintf(&clust->str,"\"%d\" -> \"%d\" [color=\"#D43F3A\" penwidth=3];", node->id,node->bkid->id);
+ else
+ ksprintf(&clust->str,"\"%d\" -> \"%d\";", node->id,node->bkid->id);
+ }
+ }
+ ksprintf(&clust->str,"};");
+ return clust->str.s;
+}
+char **hclust_explain(hclust_t *clust, int *nlines)
+{
+ clust->ndbg = 0;
+ char *beg = clust->str.s;
+ while ( *beg )
+ {
+ char *end = beg;
+ while ( *end && *end!='\n' ) end++;
+ clust->ndbg++;
+ hts_expand(char*,clust->ndbg,clust->mdbg,clust->dbg);
+ clust->dbg[clust->ndbg-1] = beg;
+ if ( !*end ) break;
+ *end = 0;
+ beg = end + 1;
+ }
+
+ *nlines = clust->ndbg;
+ return clust->dbg;
+}
+
+cluster_t *append_cluster(node_t *node, cluster_t *cluster, int *nclust, node_t **stack)
+{
+ (*nclust)++;
+ cluster = (cluster_t*) realloc(cluster,sizeof(cluster_t)*(*nclust));
+ cluster_t *clust = &cluster[*nclust-1];
+ clust->nmemb = 0;
+ clust->memb = NULL;
+ clust->dist = node->value;
+
+ int nstack = 1;
+ stack[0] = node;
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( node->akid )
+ {
+ stack[nstack++] = akid;
+ stack[nstack++] = bkid;
+ }
+ else
+ {
+ clust->nmemb++;
+ clust->memb = (int*) realloc(clust->memb,sizeof(int)*clust->nmemb);
+ clust->memb[clust->nmemb-1] = node->id;
+ }
+ }
+ return cluster;
+}
+
+int cmp_nodes(const void *a, const void *b)
+{
+ const node_t *an = *((const node_t**) a);
+ const node_t *bn = *((const node_t**) b);
+ if ( an->value < bn->value ) return -1;
+ if ( an->value > bn->value ) return 1;
+ return 0;
+}
+
+float calc_dev(node_t **dat, int n)
+{
+ float avg = 0, dev = 0;
+ int i;
+ for (i=0; i<n; i++) avg += dat[i]->value;
+ avg /= n;
+ for (i=0; i<n; i++) dev += (dat[i]->value - avg)*(dat[i]->value - avg);
+ return sqrt(dev/n);
+}
+
+/*
+ Heuristics to determine clustering cutoff: sort nodes by distance and
+ split into two groups by minimizing the standard deviation.
+ This works best when two elements from a single different sample are
+ included in the mix.
+ - min_inter_dist .. smaller values are always considered identical
+ - max_intra_dist .. larger values are always considered different
+ */
+float hclust_set_threshold(hclust_t *clust, float min_inter_dist, float max_intra_dist)
+{
+ node_t **dat = clust->rmme + clust->ndat;
+ int i, ndat = clust->nrmme - clust->ndat;
+
+ qsort(dat, ndat, sizeof(dat), cmp_nodes);
+
+ clust->str.l = 0;
+ float th, min_dev = HUGE_VAL;
+ int imin = -1;
+ for (i=0; i<ndat; i++)
+ {
+ float dev = 0;
+ if ( i>0 ) dev += calc_dev(dat,i);
+ if ( i+1<ndat ) dev += calc_dev(dat+i,ndat-i);
+ th = dat[i]->value;
+ ksprintf(&clust->str,"DEV\t%f\t%f\n",th,dev);
+ if ( min_dev > dev && th >= min_inter_dist ) { min_dev = dev; imin = i; }
+ }
+ if ( max_intra_dist > 0 )
+ th = max_intra_dist; // use fixed cutoff, the above was only for debugging output
+ else
+ {
+ // dynamic cutoff
+ max_intra_dist = fabs(max_intra_dist);
+ th = imin==-1 ? max_intra_dist : dat[imin]->value;
+ if ( th > max_intra_dist ) th = max_intra_dist;
+ }
+ ksprintf(&clust->str,"TH\t%f\n", th);
+ ksprintf(&clust->str,"MAX_DIST\t%f\n", dat[ndat-1]->value);
+ ksprintf(&clust->str,"MIN_INTER\t%f\n", min_inter_dist);
+ ksprintf(&clust->str,"MAX_INTRA\t%f\n", max_intra_dist);
+ return th;
+}
+
+cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust)
+{
+ float cutoff = *max_intra_dist = hclust_set_threshold(clust, min_inter_dist, *max_intra_dist);
+
+ node_t **stack = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ node_t **tmp = (node_t**) malloc(sizeof(node_t*)*clust->ndat);
+ stack[0] = clust->first;
+ int nstack = 1;
+
+ cluster_t *cluster = NULL;
+ int ncluster = 0;
+
+ if ( stack[0]->value < cutoff )
+ {
+ // all values are within the limits - create a single cluster
+ cluster = append_cluster(stack[0], cluster, &ncluster, tmp);
+ nstack = 0;
+ }
+
+ while ( nstack )
+ {
+ node_t *node = stack[--nstack];
+ node_t *akid = node->akid;
+ node_t *bkid = node->bkid;
+ if ( !akid )
+ {
+ cluster = append_cluster(node, cluster, &ncluster, tmp);
+ continue;
+ }
+
+ if ( node->value >= cutoff && akid->value < cutoff )
+ cluster = append_cluster(akid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = akid;
+
+ if ( node->value >= cutoff && bkid->value < cutoff )
+ cluster = append_cluster(bkid, cluster, &ncluster, tmp);
+ else
+ stack[nstack++] = bkid;
+ }
+
+ free(tmp);
+ free(stack);
+
+ *nclust = ncluster;
+ return cluster;
+}
+
+void hclust_destroy_list(cluster_t *clust, int nclust)
+{
+ int i;
+ for (i=0; i<nclust; i++) free(clust[i].memb);
+ free(clust);
+}
+
+
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+
+/*
+ Simple hierarchical clustering
+*/
+
+#ifndef __HCLUST_H__
+#define __HCLUST_H__
+
+#include <stdio.h>
+
+typedef struct _hclust_t hclust_t;
+
+typedef struct
+{
+ float dist;
+ int nmemb, *memb;
+}
+cluster_t;
+
+#define PDIST(mat,a,b) (mat)[((a)>(b)?((a)*((a)-1)/2+(b)):((b)*((b)-1)/2+(a)))]
+
+/*
+ * hclust_init() - init and run clustering
+ * @n: number of elements
+ * @pdist: pairwise distances. The array will be modified by hclust and
+ * must exist until hclust_destroy() is called
+ */
+hclust_t *hclust_init(int n, float *pdist);
+void hclust_destroy(hclust_t *clust);
+
+/*
+ * hclust_create_list() - returns a list of clusters
+ * @min_inter_dist: minimum inter-cluster distance. If smaller, elements are considered
+ * homogenous, belonging to the same cluster.
+ * @max_intra_dist: maximum intra-cluster distance allowed. If smaller than 0,
+ * the threshold can be heuristically lowered, otherwise considered
+ * a fixed cutoff. The pointer will be filled to the cutoff actually used.
+ */
+cluster_t *hclust_create_list(hclust_t *clust, float min_inter_dist, float *max_intra_dist, int *nclust);
+void hclust_destroy_list(cluster_t *clust, int nclust);
+
+/*
+ * Access debugging data used in the decision making process. Note that this
+ * must be called immediately after hclust_create_list because other calls,
+ * such as hclust_create_dot(), invalidate the temporary data structures.
+ */
+char **hclust_explain(hclust_t *clust, int *nlines);
+
+char *hclust_create_dot(hclust_t *clust, char **labels, float th);
+
+#endif
+
--- /dev/null
+/* The MIT License
+
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+
+ */
+/*
+ Usage example:
+
+ #include "kheap.h"
+
+ // First we prepare the user data to store, in this example it is a
+ // struct with a single element "key", and a comparator function
+ // "is_smaller". In this example the comparator defines a min heap (as
+ // opposed to a max heap).
+ typedef struct
+ {
+ uint32_t key;
+ }
+ data_t;
+ static inline int is_smaller(data_t *a, data_t *b)
+ {
+ return a->key < b->key ? 1 : 0;
+ }
+ data_t data[3] = { {3}, {2}, {1} };
+
+
+ // Heap declaration, "mh" is an arbitrary string. The typedef is not
+ // required, it is just a convenience shortcut so that we can use
+ // "heap_t" instead of the generic "khp_mh_t" automatically created by
+ // the KHEAP_INIT macro.
+ KHEAP_INIT(mh, data_t, is_smaller)
+ typedef khp_mh_t heap_t;
+
+ // Initialize the heap, insert the test data, then retrieve them back,
+ // sorted. Multiple heaps with the same name "mh" can be created and
+ // used simultaneously, as long as they all use the same data type
+ // "data_t".
+ heap_t *heap = khp_init(mh);
+
+ for (int i=0; i<3; i++)
+ khp_insert(mh, heap, &data[i]);
+
+ while (heap->ndat)
+ {
+ printf("%d\n", heap->dat[0].pos);
+ khp_delete(mh, heap);
+ }
+
+ // Clean up
+ khp_destroy(mh, heap);
+
+*/
+
+#ifndef __KHEAP_H__
+#define __KHEAP_H__
+
+#include <stdlib.h>
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+
+#define __KHEAP_TYPE(name, kheap_t) \
+ typedef struct { \
+ int ndat, mdat; \
+ kheap_t *dat; \
+ kheap_t tmp; \
+ } khp_##name##_t;
+
+#define khp_parent(i) (((i)-1)/2)
+#define khp_lchild(i) (2*(i)+1)
+#define khp_rchild(i) (2*(i)+2)
+#define khp_swap(hp,i,j) { \
+ ((hp)->tmp) = ((hp)->dat[i]); \
+ ((hp)->dat[i]) = ((hp)->dat[j]); \
+ ((hp)->dat[j]) = ((hp)->tmp); \
+ }
+
+#define __KHEAP_IMPL(name, SCOPE, kheap_t, __cmp) \
+ SCOPE khp_##name##_t *khp_init_##name(void) \
+ { \
+ return (khp_##name##_t*)calloc(1, sizeof(khp_##name##_t)); \
+ } \
+ SCOPE void khp_destroy_##name(khp_##name##_t *heap) \
+ { \
+ if (heap) free(heap->dat); \
+ free(heap); \
+ } \
+ SCOPE int khp_insert_##name(khp_##name##_t *heap, kheap_t *dat) \
+ { \
+ heap->ndat++; \
+ if ( heap->ndat > heap->mdat ) \
+ { \
+ heap->mdat = heap->ndat; \
+ kroundup32(heap->mdat); \
+ heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
+ } \
+ int i = heap->ndat - 1; \
+ while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) ) \
+ { \
+ heap->dat[i] = heap->dat[khp_parent(i)]; \
+ i = khp_parent(i); \
+ } \
+ heap->dat[i] = *dat; \
+ return i; \
+ } \
+ SCOPE void khp_heapify_##name(khp_##name##_t *heap, int i) \
+ { \
+/*todo: loop instead of a recursive function? */ \
+ int extreme = khp_lchild(i) < heap->ndat && __cmp(&heap->dat[khp_lchild(i)],&heap->dat[i]) ? khp_lchild(i) : i; \
+ if ( khp_rchild(i) < heap->ndat && __cmp(&heap->dat[khp_rchild(i)],&heap->dat[extreme]) ) extreme = khp_rchild(i); \
+ if ( extreme != i ) \
+ { \
+ khp_swap(heap,i,extreme); \
+ khp_heapify_##name(heap,extreme); \
+ } \
+ } \
+ SCOPE void khp_delete_##name(khp_##name##_t *heap) \
+ { \
+ if ( !heap || !heap->ndat ) return; \
+ heap->dat[0] = heap->dat[--heap->ndat]; \
+ khp_heapify_##name(heap, 0); \
+ } \
+
+#define KHEAP_INIT(name, kheap_t, __cmp) \
+ __KHEAP_TYPE(name, kheap_t) \
+ __KHEAP_IMPL(name, static kh_inline klib_unused, kheap_t, __cmp)
+
+#define khp_init(name) khp_init_##name()
+#define khp_destroy(name, heap) khp_destroy_##name(heap)
+#define khp_insert(name, heap, dat) khp_insert_##name(heap, dat)
+#define khp_delete(name, heap) khp_delete_##name(heap)
+
+#endif
#endif
int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
+int main_csq(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
typedef struct
{
.alias = "cnv",
.help = "HMM CNV calling"
},
+ { .func = main_csq,
+ .alias = "csq",
+ .help = "call variation consequences"
+ },
{ .func = main_vcffilter,
.alias = "filter",
.help = "filter VCF/BCF files using fixed thresholds"
.alias = "gtcheck",
.help = "check sample concordance, detect sample swaps and contamination"
},
+ { .func = bam_mpileup,
+ .alias = "mpileup",
+ .help = "multi-way pileup producing genotype likelihoods"
+ },
#if USE_GPL
{ .func = main_polysomy,
.alias = "polysomy",
#endif
int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
+int main_csq(int argc, char *argv[]);
+int bam_mpileup(int argc, char *argv[]);
typedef struct
{
.alias = "cnv",
.help = "HMM CNV calling"
},
+ { .func = main_csq,
+ .alias = "csq",
+ .help = "call variation consequences"
+ },
{ .func = main_vcffilter,
.alias = "filter",
.help = "filter VCF/BCF files using fixed thresholds"
.alias = "gtcheck",
.help = "check sample concordance, detect sample swaps and contamination"
},
+ { .func = bam_mpileup,
+ .alias = "mpileup",
+ .help = "multi-way pileup producing genotype likelihoods"
+ },
#if USE_GPL
{ .func = main_polysomy,
.alias = "polysomy",
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
//
static void mcall_init_trios(call_t *call)
{
+ if ( call->prior_AN )
+ {
+ int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN);
+ id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC);
+ }
+
// 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
break;
}
if ( PLs[j]==bcf_int32_missing ) break;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
{
assert( PLs[j]!=bcf_int32_vector_end );
if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
}
/**
* log(sum_i exp(a_i))
*/
-static inline double logsumexp(double *vals, int nvals)
-{
- int i;
- double max_exp = vals[0];
- for (i=1; i<nvals; i++)
- if ( max_exp < vals[i] ) max_exp = vals[i];
-
- double sum = 0;
- for (i=0; i<nvals; i++)
- sum += exp(vals[i] - max_exp);
-
- return log(sum) + max_exp;
-}
+// static inline double logsumexp(double *vals, int nvals)
+// {
+// int i;
+// double max_exp = vals[0];
+// for (i=1; i<nvals; i++)
+// if ( max_exp < vals[i] ) max_exp = vals[i];
+
+// double sum = 0;
+// for (i=0; i<nvals; i++)
+// sum += exp(vals[i] - max_exp);
+
+// return log(sum) + max_exp;
+// }
/** log(exp(a)+exp(b)) */
static inline double logsumexp2(double a, double b)
{
}
// Macro to set the most likely alleles
-#define UPDATE_MAX_LKs(als) { \
+#define UPDATE_MAX_LKs(als,sum) { \
if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
- if ( lk_tot_set ) lk_sum = logsumexp2(lk_tot,lk_sum); \
+ if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
}
#define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
}
if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
else lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia);
+ UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
}
// Two alleles
int lk_tot_set = 0;
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]);
- double fab = 2*fa*fb; fa *= fa; fb *= fb;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fab = 2*fa*fb;
int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
double *pdg = call->pdg;
for (isample=0; isample<nsmpl; isample++)
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta;
- UPDATE_MAX_LKs(1<<ia|1<<ib);
+ UPDATE_MAX_LKs(1<<ia|1<<ib, lk_tot_set);
}
}
}
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fc2 = fc*fc;
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
int isample, icc = (ic+1)*(ic+2)/2-1;
int iac = iaa - ia + ic, ibc = ibb - ib + ic;
double *pdg = call->pdg;
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta; // the prior
if ( ic!=0 ) lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic);
+ UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic, lk_tot_set);
}
}
}
{
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
#if USE_PRIOR_FOR_GTS
if ( ia!=0 ) lk *= prior;
#endif
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
sum_lk += lk;
gls[idx] = lk;
if ( best_lk < lk )
void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
{
- int i, ret;
+ if ( nals==nout_als ) return;
+
+ int i,j, nret, size = sizeof(float);
+
+ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
+ int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
- // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer,
- // so only dealing with these cases at the moment
+ // INFO fields
for (i=0; i<rec->n_info; i++)
{
bcf_info_t *info = &rec->d.info[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
- if ( vlen!=BCF_VL_R ) continue;
- int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
- if ( type!=BCF_HT_INT ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
- ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp);
- if ( ret>0 )
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key);
+ nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if ( nret<=0 ) continue;
+
+ if ( nout_als==1 )
+ bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change
+ else
{
- assert( ret==nals );
- if ( out_als==1 )
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1);
- else
+ for (j=0; j<nals; j++)
{
- int j;
- for (j=0; j<nals; j++)
- {
- if ( call->als_map[j]==-1 ) continue; // to be dropped
- call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point
- }
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als);
+ int k = call->als_map[j];
+ if ( k==-1 ) continue; // to be dropped
+ memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
}
+ bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
}
}
+ // FORMAT fields
for (i=0; i<rec->n_fmt; i++)
{
bcf_fmt_t *fmt = &rec->d.fmt[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
- if ( vlen!=BCF_VL_R ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
+
int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
- if ( type!=BCF_HT_INT ) continue;
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
+ nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if (nret<=0) continue;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
- ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp);
- if ( ret>0 )
- {
- int j, nsmpl = bcf_hdr_nsamples(call->hdr);
- int ndp = ret / nsmpl;
- assert( ndp==nals );
- if ( out_als==1 )
- {
- for (j=0; j<nsmpl; j++)
- call->PLs[j] = call->itmp[j*ndp];
+ assert( nret==nals*nsmpl );
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl);
- }
- else
+ for (j=0; j<nsmpl; j++)
+ {
+ char *ptr_src = (char *)tmp_ori + j*nals*size;
+ char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+ int k;
+ for (k=0; k<nals; k++)
{
- int k;
- for (j=0; j<nsmpl; j++)
- {
- int32_t *dp_dst = call->PLs + j*nout_als;
- int32_t *dp_src = call->itmp + j*ndp;
- for (k=0; k<nals; k++)
- {
- if ( call->als_map[k]==-1 ) continue; // to be dropped
- dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point
- }
- }
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als);
+ int l = call->als_map[k];
+ if ( l==-1 ) continue; // to be dropped
+ memcpy(ptr_dst+size*l, ptr_src+size*k, size);
}
}
+ bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
}
+
+ call->PLs = (int32_t*) tmp_new;
+ call->mPLs = ntmp_new;
+ call->itmp = (int32_t*) tmp_ori;
+ call->n_itmp = ntmp_ori;
}
// NB: in this function we temporarily use calls->als_map for a different
// purpose to store mapping from new (target) alleles to original alleles.
//
-static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
+static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
{
bcf_sr_regions_t *tgt = call->srs->targets;
if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals);
call->als[nals] = tgt->als[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]);
- if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]);
+ if ( j+1==*unseen ) { fprintf(stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; }
if ( j>=0 )
{
nals++;
}
- if ( !has_new && nals==rec->n_allele ) return;
+ if ( !has_new && nals==rec->n_allele ) return 0;
bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
// create mapping from new PL to old PL
bcf_update_info_float(call->hdr, rec, "QS", qsum, nals);
if ( *unseen ) *unseen = nals-1;
+ return 0;
}
int i, unseen = call->unseen;
// Force alleles when calling genotypes given alleles was requested
- if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen);
+ if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
int nsmpl = bcf_hdr_nsamples(call->hdr);
int nals = rec->n_allele;
#if QS_FROM_PDG
estimate_qsum(call, rec);
#else
- // Get sum of qualities
+ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
if ( nqs < nals )
hts_expand(float,nals,call->nqsum,call->qsum);
for (i=nqs; i<nals; i++) call->qsum[i] = 0;
}
- float qsum_tot = 0;
- for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
- if ( !call->qsum[0] )
+
+ // If available, take into account reference panel AFs
+ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
{
- // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
- // an equivalent of a single reference read.
- if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
- error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
- if ( call->itmp[0] )
+ int an = call->ac[0];
+ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
{
- call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
- qsum_tot += call->qsum[0];
+ int ac0 = an; // number of alleles in the reference population
+ for (i=0; i<nals-1; i++)
+ {
+ if ( call->ac[i]==bcf_int32_vector_end ) break;
+ if ( call->ac[i]==bcf_int32_missing ) continue;
+ ac0 -= call->ac[i];
+ call->qsum[i+1] += call->ac[i]*0.5;
+ }
+ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
+ call->qsum[0] += ac0*0.5;
+ for (i=0; i<nals; i++) call->qsum[i] /= nsmpl + 0.5*an;
}
}
+
+ float qsum_tot = 0;
+ for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
+
+ // Is this still necessary??
+ //
+ // if (0&& !call->qsum[0] )
+ // {
+ // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
+ // // an equivalent of a single reference read.
+ // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
+ // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
+ // if ( call->itmp[0] )
+ // {
+ // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
+ // qsum_tot += call->qsum[0];
+ // }
+ // }
+
if ( qsum_tot ) for (i=0; i<nals; i++) call->qsum[i] /= qsum_tot;
#endif
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
// Find the best combination of alleles
int out_als, nout;
if ( nals > 8*sizeof(out_als) )
if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
// Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum));
+ rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
}
else
{
// Set the quality of a REF site
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum));
+ if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior
+ rec->qual = call->theta ? -4.343*call->theta : 0;
+ else
+ rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
}
+
if ( rec->qual>999 ) rec->qual = 999;
if ( rec->qual>50 ) rec->qual = rint(rec->qual);
}
bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
- bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
return nout;
}
/* mcall.c -- multiallelic and rare variant calling.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
//
static void mcall_init_trios(call_t *call)
{
+ if ( call->prior_AN )
+ {
+ int id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AN);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AN);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AN);
+ id = bcf_hdr_id2int(call->hdr,BCF_DT_ID,call->prior_AC);
+ if ( id==-1 ) error("No such tag \"%s\"\n", call->prior_AC);
+ if ( !bcf_hdr_idinfo_exists(call->hdr,BCF_HL_FMT,id) ) error("No such FORMAT tag \"%s\"\n", call->prior_AC);
+ }
+
// 23, 138, 478 possible diploid trio genotypes with 2, 3, 4 alleles
call->ntrio[FTYPE_222][2] = 15; call->ntrio[FTYPE_222][3] = 78; call->ntrio[FTYPE_222][4] = 250;
call->ntrio[FTYPE_121][2] = 8; call->ntrio[FTYPE_121][3] = 27; call->ntrio[FTYPE_121][4] = 64;
break;
}
if ( PLs[j]==bcf_int32_missing ) break;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
{
assert( PLs[j]!=bcf_int32_vector_end );
if ( PLs[j]==bcf_int32_missing ) PLs[j] = 255;
- assert( PLs[j]<256 );
- pdg[j] = pl2p[ PLs[j] ];
+ pdg[j] = PLs[j] < 256 ? pl2p[PLs[j]] : pow(10., -PLs[j]/10.);
sum += pdg[j];
}
}
/**
* log(sum_i exp(a_i))
*/
-static inline double logsumexp(double *vals, int nvals)
-{
- int i;
- double max_exp = vals[0];
- for (i=1; i<nvals; i++)
- if ( max_exp < vals[i] ) max_exp = vals[i];
-
- double sum = 0;
- for (i=0; i<nvals; i++)
- sum += exp(vals[i] - max_exp);
-
- return log(sum) + max_exp;
-}
+// static inline double logsumexp(double *vals, int nvals)
+// {
+// int i;
+// double max_exp = vals[0];
+// for (i=1; i<nvals; i++)
+// if ( max_exp < vals[i] ) max_exp = vals[i];
+
+// double sum = 0;
+// for (i=0; i<nvals; i++)
+// sum += exp(vals[i] - max_exp);
+
+// return log(sum) + max_exp;
+// }
/** log(exp(a)+exp(b)) */
static inline double logsumexp2(double a, double b)
{
}
// Macro to set the most likely alleles
-#define UPDATE_MAX_LKs(als) { \
+#define UPDATE_MAX_LKs(als,sum) { \
if ( max_lk<lk_tot ) { max_lk = lk_tot; max_als = (als); } \
- if ( lk_tot_set ) lk_sum = logsumexp2(lk_tot,lk_sum); \
+ if ( sum ) lk_sum = logsumexp2(lk_tot,lk_sum); \
}
#define SWAP(type_t,x,y) {type_t tmp; tmp = x; x = y; y = tmp; }
}
if ( ia==0 ) ref_lk = lk_tot; // likelihood of 0/0 for all samples
else lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia);
+ UPDATE_MAX_LKs(1<<ia, ia>0 && lk_tot_set);
}
// Two alleles
int lk_tot_set = 0;
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]);
- double fab = 2*fa*fb; fa *= fa; fb *= fb;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fab = 2*fa*fb;
int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
double *pdg = call->pdg;
for (isample=0; isample<nsmpl; isample++)
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fab*pdg[iab];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fab*pdg[iab];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
}
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta;
- UPDATE_MAX_LKs(1<<ia|1<<ib);
+ UPDATE_MAX_LKs(1<<ia|1<<ib, lk_tot_set);
}
}
}
double fa = call->qsum[ia]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fb = call->qsum[ib]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
double fc = call->qsum[ic]/(call->qsum[ia]+call->qsum[ib]+call->qsum[ic]);
- double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ double fa2 = fa*fa;
+ double fb2 = fb*fb;
+ double fc2 = fc*fc;
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc;
int isample, icc = (ic+1)*(ic+2)/2-1;
int iac = iaa - ia + ic, ibc = ibb - ib + ic;
double *pdg = call->pdg;
{
double val = 0;
if ( !call->ploidy || call->ploidy[isample]==2 )
- val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
+ val = fa2*pdg[iaa] + fb2*pdg[ibb] + fc2*pdg[icc] + fab*pdg[iab] + fac*pdg[iac] + fbc*pdg[ibc];
else if ( call->ploidy && call->ploidy[isample]==1 )
val = fa*pdg[iaa] + fb*pdg[ibb] + fc*pdg[icc];
if ( val ) { lk_tot += log(val); lk_tot_set = 1; }
if ( ia!=0 ) lk_tot += call->theta; // the prior
if ( ib!=0 ) lk_tot += call->theta; // the prior
if ( ic!=0 ) lk_tot += call->theta; // the prior
- UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic);
+ UPDATE_MAX_LKs(1<<ia|1<<ib|1<<ic, lk_tot_set);
}
}
}
{
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = (ia+1)*(ia+2)/2-1; // PL index of the ia/ia genotype
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
#if USE_PRIOR_FOR_GTS
if ( ia!=0 ) lk *= prior;
#endif
if ( !(out_als & 1<<ia) ) continue; // ia-th allele not in the final selection, skip
int iaa = bcf_alleles2gt(ia,ia); // PL index of the ia/ia genotype
int idx = bcf_alleles2gt(call->als_map[ia],call->als_map[ia]);
- double lk = pdg[iaa]*call->qsum[ia]*call->qsum[ia];
+ double lk = ploidy==2 ? pdg[iaa]*call->qsum[ia]*call->qsum[ia] : pdg[iaa]*call->qsum[ia];
sum_lk += lk;
gls[idx] = lk;
if ( best_lk < lk )
void mcall_trim_numberR(call_t *call, bcf1_t *rec, int nals, int nout_als, int out_als)
{
- int i, ret;
+ if ( nals==nout_als ) return;
+
+ int i,j, nret, size = sizeof(float);
+
+ void *tmp_ori = call->itmp, *tmp_new = call->PLs; // reusing PLs storage which is not used at this point
+ int ntmp_ori = call->n_itmp, ntmp_new = call->mPLs;
- // at the moment we have DPR,AD,ADF,ADR all Number=R,Type=Integer,
- // so only dealing with these cases at the moment
+ // INFO fields
for (i=0; i<rec->n_info; i++)
{
bcf_info_t *info = &rec->d.info[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_INFO,info->key);
- if ( vlen!=BCF_VL_R ) continue;
- int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
- if ( type!=BCF_HT_INT ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
- ret = bcf_get_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), &call->itmp, &call->n_itmp);
- if ( ret>0 )
+ int type = bcf_hdr_id2type(call->hdr,BCF_HL_INFO,info->key);
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key);
+ nret = bcf_get_info_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if ( nret<=0 ) continue;
+
+ if ( nout_als==1 )
+ bcf_update_info_int32(call->hdr, rec, key, tmp_ori, 1); // has to be the REF, the order could not change
+ else
{
- assert( ret==nals );
- if ( out_als==1 )
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->itmp, 1);
- else
+ for (j=0; j<nals; j++)
{
- int j;
- for (j=0; j<nals; j++)
- {
- if ( call->als_map[j]==-1 ) continue; // to be dropped
- call->PLs[ call->als_map[j] ] = call->itmp[j]; // reusing PLs storage which is not used at this point
- }
- bcf_update_info_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,info->key), call->PLs, nout_als);
+ int k = call->als_map[j];
+ if ( k==-1 ) continue; // to be dropped
+ memcpy((char *)tmp_new+size*k, (char *)tmp_ori+size*j, size);
}
+ bcf_update_info_int32(call->hdr, rec, key, tmp_new, nout_als);
}
}
+ // FORMAT fields
for (i=0; i<rec->n_fmt; i++)
{
bcf_fmt_t *fmt = &rec->d.fmt[i];
int vlen = bcf_hdr_id2length(call->hdr,BCF_HL_FMT,fmt->id);
- if ( vlen!=BCF_VL_R ) continue;
+ if ( vlen!=BCF_VL_R ) continue; // not a Number=R tag
+
int type = bcf_hdr_id2type(call->hdr,BCF_HL_FMT,fmt->id);
- if ( type!=BCF_HT_INT ) continue;
+ const char *key = bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id);
+ nret = bcf_get_format_values(call->hdr, rec, key, &tmp_ori, &ntmp_ori, type);
+ if (nret<=0) continue;
+ int nsmpl = bcf_hdr_nsamples(call->hdr);
- ret = bcf_get_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), &call->itmp, &call->n_itmp);
- if ( ret>0 )
- {
- int j, nsmpl = bcf_hdr_nsamples(call->hdr);
- int ndp = ret / nsmpl;
- assert( ndp==nals );
- if ( out_als==1 )
- {
- for (j=0; j<nsmpl; j++)
- call->PLs[j] = call->itmp[j*ndp];
+ assert( nret==nals*nsmpl );
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl);
- }
- else
+ for (j=0; j<nsmpl; j++)
+ {
+ char *ptr_src = (char *)tmp_ori + j*nals*size;
+ char *ptr_dst = (char *)tmp_new + j*nout_als*size;
+ int k;
+ for (k=0; k<nals; k++)
{
- int k;
- for (j=0; j<nsmpl; j++)
- {
- int32_t *dp_dst = call->PLs + j*nout_als;
- int32_t *dp_src = call->itmp + j*ndp;
- for (k=0; k<nals; k++)
- {
- if ( call->als_map[k]==-1 ) continue; // to be dropped
- dp_dst[ call->als_map[k] ] = dp_src[k]; // reusing PLs storage which is not used at this point
- }
- }
- bcf_update_format_int32(call->hdr, rec, bcf_hdr_int2id(call->hdr,BCF_DT_ID,fmt->id), call->PLs, nsmpl*nout_als);
+ int l = call->als_map[k];
+ if ( l==-1 ) continue; // to be dropped
+ memcpy(ptr_dst+size*l, ptr_src+size*k, size);
}
}
+ bcf_update_format_int32(call->hdr, rec, key, tmp_new, nout_als*nsmpl);
}
+
+ call->PLs = (int32_t*) tmp_new;
+ call->mPLs = ntmp_new;
+ call->itmp = (int32_t*) tmp_ori;
+ call->n_itmp = ntmp_ori;
}
// NB: in this function we temporarily use calls->als_map for a different
// purpose to store mapping from new (target) alleles to original alleles.
//
-static void mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
+static int mcall_constrain_alleles(call_t *call, bcf1_t *rec, int *unseen)
{
bcf_sr_regions_t *tgt = call->srs->targets;
if ( tgt->nals>5 ) error("Maximum accepted number of alleles is 5, got %d\n", tgt->nals);
call->als[nals] = tgt->als[i];
j = vcmp_find_allele(call->vcmp, rec->d.allele+1, rec->n_allele - 1, tgt->als[i]);
- if ( j+1==*unseen ) error("Cannot constrain to %s\n",tgt->als[i]);
+ if ( j+1==*unseen ) { fprintf(pysam_stderr,"fixme? Cannot constrain to %s\n",tgt->als[i]); return -1; }
if ( j>=0 )
{
nals++;
}
- if ( !has_new && nals==rec->n_allele ) return;
+ if ( !has_new && nals==rec->n_allele ) return 0;
bcf_update_alleles(call->hdr, rec, (const char**)call->als, nals);
// create mapping from new PL to old PL
bcf_update_info_float(call->hdr, rec, "QS", qsum, nals);
if ( *unseen ) *unseen = nals-1;
+ return 0;
}
int i, unseen = call->unseen;
// Force alleles when calling genotypes given alleles was requested
- if ( call->flag & CALL_CONSTR_ALLELES ) mcall_constrain_alleles(call, rec, &unseen);
+ if ( call->flag & CALL_CONSTR_ALLELES && mcall_constrain_alleles(call, rec, &unseen)!=0 ) return -2;
int nsmpl = bcf_hdr_nsamples(call->hdr);
int nals = rec->n_allele;
#if QS_FROM_PDG
estimate_qsum(call, rec);
#else
- // Get sum of qualities
+ // Get sum of qualities, serves as an AF estimate, f_x = QS/N in Eq. 1 in call-m math notes.
int nqs = bcf_get_info_float(call->hdr, rec, "QS", &call->qsum, &call->nqsum);
if ( nqs<=0 ) error("The QS annotation not present at %s:%d\n", bcf_seqname(call->hdr,rec),rec->pos+1);
if ( nqs < nals )
hts_expand(float,nals,call->nqsum,call->qsum);
for (i=nqs; i<nals; i++) call->qsum[i] = 0;
}
- float qsum_tot = 0;
- for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
- if ( !call->qsum[0] )
+
+ // If available, take into account reference panel AFs
+ if ( call->prior_AN && bcf_get_info_int32(call->hdr, rec, call->prior_AN ,&call->ac, &call->nac)==1 )
{
- // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
- // an equivalent of a single reference read.
- if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
- error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
- if ( call->itmp[0] )
+ int an = call->ac[0];
+ if ( bcf_get_info_int32(call->hdr, rec, call->prior_AC ,&call->ac, &call->nac)==nals-1 )
{
- call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
- qsum_tot += call->qsum[0];
+ int ac0 = an; // number of alleles in the reference population
+ for (i=0; i<nals-1; i++)
+ {
+ if ( call->ac[i]==bcf_int32_vector_end ) break;
+ if ( call->ac[i]==bcf_int32_missing ) continue;
+ ac0 -= call->ac[i];
+ call->qsum[i+1] += call->ac[i]*0.5;
+ }
+ if ( ac0<0 ) error("Incorrect %s,%s values at %s:%d\n", call->prior_AN,call->prior_AC,bcf_seqname(call->hdr,rec),rec->pos+1);
+ call->qsum[0] += ac0*0.5;
+ for (i=0; i<nals; i++) call->qsum[i] /= nsmpl + 0.5*an;
}
}
+
+ float qsum_tot = 0;
+ for (i=0; i<nals; i++) qsum_tot += call->qsum[i];
+
+ // Is this still necessary??
+ //
+ // if (0&& !call->qsum[0] )
+ // {
+ // // As P(RR)!=0 even for QS(ref)=0, we set QS(ref) to a small value,
+ // // an equivalent of a single reference read.
+ // if ( bcf_get_info_int32(call->hdr, rec, "DP", &call->itmp, &call->n_itmp)!=1 )
+ // error("Could not read DP at %s:%d\n", call->hdr->id[BCF_DT_CTG][rec->rid].key,rec->pos+1);
+ // if ( call->itmp[0] )
+ // {
+ // call->qsum[0] = 1.0 / call->itmp[0] / nsmpl;
+ // qsum_tot += call->qsum[0];
+ // }
+ // }
+
if ( qsum_tot ) for (i=0; i<nals; i++) call->qsum[i] /= qsum_tot;
#endif
+ bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
+
// Find the best combination of alleles
int out_als, nout;
if ( nals > 8*sizeof(out_als) )
if ( hob != HUGE_VAL ) bcf_update_info_float(call->hdr, rec, "HOB", &hob, 1);
// Quality of a variant site. fabs() to avoid negative zeros in VCF output when CALL_KEEPALT is set
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : fabs(-4.343*(call->ref_lk - call->lk_sum));
+ rec->qual = -4.343*(call->ref_lk - logsumexp2(call->lk_sum,call->ref_lk));
}
else
{
// Set the quality of a REF site
- rec->qual = call->lk_sum==-HUGE_VAL || call->ref_lk==0 ? 0 : -4.343*log(1 - exp(call->ref_lk - call->lk_sum));
+ if ( call->lk_sum==-HUGE_VAL ) // no support from (high quality) reads, so QUAL=1-prior
+ rec->qual = call->theta ? -4.343*call->theta : 0;
+ else
+ rec->qual = -4.343*(call->lk_sum - logsumexp2(call->lk_sum,call->ref_lk));
}
+
if ( rec->qual>999 ) rec->qual = 999;
if ( rec->qual>50 ) rec->qual = rint(rec->qual);
}
bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
- bcf_update_info_int32(call->hdr, rec, "QS", NULL, 0); // remove QS tag
return nout;
}
--- /dev/null
+/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
+
+ Copyright (C) 2008-2017 Genome Research Ltd.
+ Portions copyright (C) 2009-2012 Broad Institute.
+
+ Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <string.h>
+#include <strings.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <getopt.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <assert.h>
+#include "regidx.h"
+#include "bcftools.h"
+#include "bam2bcf.h"
+#include "bam_sample.h"
+#include "gvcf.h"
+
+#define MPLP_BCF 1
+#define MPLP_VCF (1<<1)
+#define MPLP_NO_COMP (1<<2)
+#define MPLP_NO_ORPHAN (1<<3)
+#define MPLP_REALN (1<<4)
+#define MPLP_NO_INDEL (1<<5)
+#define MPLP_REDO_BAQ (1<<6)
+#define MPLP_ILLUMINA13 (1<<7)
+#define MPLP_IGNORE_RG (1<<8)
+#define MPLP_PRINT_POS (1<<9)
+#define MPLP_PRINT_MAPQ (1<<10)
+#define MPLP_PER_SAMPLE (1<<11)
+#define MPLP_SMART_OVERLAPS (1<<12)
+
+typedef struct _mplp_aux_t mplp_aux_t;
+typedef struct _mplp_pileup_t mplp_pileup_t;
+
+// Data shared by all bam files
+typedef struct {
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int rflag_require, rflag_filter, output_type;
+ int openQ, extQ, tandemQ, min_support; // for indels
+ double min_frac; // for indels
+ char *reg_fname, *pl_list, *fai_fname, *output_fname;
+ int reg_is_file, record_cmd_line, n_threads;
+ faidx_t *fai;
+ regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions
+ regitr_t *bed_itr, *reg_itr;
+ int bed_logic; // 1: include region, 0: exclude region
+ gvcf_t *gvcf;
+
+ // auxiliary structures for calling
+ bcf_callaux_t *bca;
+ bcf_callret1_t *bcr;
+ bcf_call_t bc;
+ bam_mplp_t iter;
+ mplp_aux_t **mplp_data;
+ int nfiles;
+ char **files;
+ mplp_pileup_t *gplp;
+ int *n_plp;
+ const bam_pileup1_t **plp;
+ bam_smpl_t *bsmpl;
+ kstring_t buf;
+ bcf1_t *bcf_rec;
+ htsFile *bcf_fp;
+ bcf_hdr_t *bcf_hdr;
+ int argc;
+ char **argv;
+} mplp_conf_t;
+
+typedef struct {
+ char *ref[2];
+ int ref_id[2];
+ int ref_len[2];
+} mplp_ref_t;
+
+#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
+
+// Data specific to each bam file
+struct _mplp_aux_t {
+ samFile *fp;
+ hts_itr_t *iter;
+ bam_hdr_t *h;
+ mplp_ref_t *ref;
+ const mplp_conf_t *conf;
+ int bam_id;
+ hts_idx_t *idx; // maintained only with more than one -r regions
+};
+
+// Data passed to htslib/mpileup
+struct _mplp_pileup_t {
+ int n;
+ int *n_plp, *m_plp;
+ bam_pileup1_t **plp;
+};
+
+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
+ mplp_ref_t *r = ma->ref;
+
+ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
+
+ if (!r || !ma->conf->fai) {
+ *ref = NULL;
+ return 0;
+ }
+
+ // Do we need to reference count this so multiple mplp_aux_t can
+ // track which references are in use?
+ // For now we just cache the last two. Sufficient?
+ if (tid == r->ref_id[0]) {
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+ if (tid == r->ref_id[1]) {
+ // Last, swap over
+ int tmp;
+ tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp;
+ tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
+
+ char *tc;
+ tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+
+ // New, so migrate to old and load new
+ free(r->ref[1]);
+ r->ref[1] = r->ref[0];
+ r->ref_id[1] = r->ref_id[0];
+ r->ref_len[1] = r->ref_len[0];
+
+ r->ref_id[0] = tid;
+ r->ref[0] = faidx_fetch_seq(ma->conf->fai,
+ ma->h->target_name[r->ref_id[0]],
+ 0,
+ INT_MAX,
+ &r->ref_len[0]);
+
+ if (!r->ref[0]) {
+ r->ref[0] = NULL;
+ r->ref_id[0] = -1;
+ r->ref_len[0] = 0;
+ *ref = NULL;
+ return 0;
+ }
+
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+}
+
+static int mplp_func(void *data, bam1_t *b)
+{
+ char *ref;
+ mplp_aux_t *ma = (mplp_aux_t*)data;
+ int ret, ref_len;
+ while (1)
+ {
+ int has_ref;
+ ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
+ if (ret < 0) break;
+ // The 'B' cigar operation is not part of the specification, considering as obsolete.
+ // bam_remove_B(b);
+ if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads
+ if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue;
+ if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue;
+ if (ma->conf->bed)
+ {
+ // test overlap
+ regitr_t *itr = ma->conf->bed_itr;
+ int beg = b->core.pos, end = bam_endpos(b)-1;
+ int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr);
+ if ( !ma->conf->bed_logic && !overlap )
+ {
+ // exclude only reads which are fully contained in the region
+ while ( regitr_overlap(itr) )
+ {
+ if ( beg < itr->beg ) { overlap = 1; break; }
+ if ( end > itr->end ) { overlap = 1; break; }
+ }
+ }
+ if ( !overlap ) continue;
+ }
+ if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue;
+ if (ma->conf->flag & MPLP_ILLUMINA13) {
+ int i;
+ uint8_t *qual = bam_get_qual(b);
+ for (i = 0; i < b->core.l_qseq; ++i)
+ qual[i] = qual[i] > 31? qual[i] - 31 : 0;
+ }
+
+ if (ma->conf->fai && b->core.tid >= 0) {
+ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
+ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
+ fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
+ __func__, b->core.pos, ref_len, b->core.tid);
+ continue;
+ }
+ } else {
+ has_ref = 0;
+ }
+
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && ma->conf->capQ_thres > 10) {
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
+ if (q < 0) continue; // skip
+ else if (b->core.qual > q) b->core.qual = q;
+ }
+ if (b->core.qual < ma->conf->min_mq) continue;
+ else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue;
+
+ return ret;
+ };
+ return ret;
+}
+
+// Called once per new bam added to the pileup.
+// We cache sample information here so we don't have to keep recomputing this
+// on each and every pileup column.
+//
+// Cd is an arbitrary block of data we can write into, which ends up in
+// the pileup structures. We stash the sample ID there.
+static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+ mplp_aux_t *ma = (mplp_aux_t *)data;
+ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+ return 0;
+}
+
+static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
+{
+ int i, j;
+ memset(m->n_plp, 0, m->n * sizeof(int));
+ for (i = 0; i < n; ++i) // iterate over all bams
+ {
+ for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position
+ {
+ const bam_pileup1_t *p = plp[i] + j;
+ int id = p->cd.i;
+ if (m->n_plp[id] == m->m_plp[id])
+ {
+ m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
+ m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
+ }
+ m->plp[id][m->n_plp[id]++] = *p;
+ }
+ }
+}
+
+static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec)
+{
+ if ( !conf->gvcf )
+ {
+ if ( rec ) bcf_write1(fp, hdr, rec);
+ return;
+ }
+
+ if ( !rec )
+ {
+ gvcf_write(conf->gvcf, fp, hdr, NULL, 0);
+ return;
+ }
+
+ int is_ref = 0;
+ if ( rec->n_allele==1 ) is_ref = 1;
+ else if ( rec->n_allele==2 )
+ {
+ // second allele is mpileup's X, not a variant
+ if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1;
+ }
+ rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref);
+ if ( rec ) bcf_write1(fp,hdr,rec);
+}
+
+static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
+{
+ bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
+
+ int ret, i, tid, pos, ref_len;
+ char *ref;
+
+ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
+ {
+ if ( end && (pos<beg || pos>end) ) continue;
+ if ( conf->bed && tid >= 0 )
+ {
+ int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL);
+ if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
+ if ( !overlap ) continue;
+ }
+ mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+
+ int total_depth, _ref0, ref16;
+ for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
+ group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp);
+ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
+ ref16 = seq_nt16_table[_ref0];
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i);
+ conf->bc.tid = tid; conf->bc.pos = pos;
+ bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+
+ // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
+ // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+ && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+ {
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ {
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ }
+ }
+ }
+ return 0;
+}
+
+static int mpileup(mplp_conf_t *conf)
+{
+ if (conf->nfiles == 0) {
+ fprintf(stderr,"[%s] no input file/data given\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ mplp_ref_t mp_ref = MPLP_REF_INIT;
+ conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
+ conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
+ conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
+ conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));
+
+ // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
+ // must be kept in the memory for the whole time which can be a problem with many bams.
+ // Therefore if none or only one region is requested, we initialize the bam iterator as
+ // before and free the index. Only when multiple regions are queried, we keep the index.
+ int nregs = 0;
+ if ( conf->reg_fname )
+ {
+ if ( conf->reg_is_file )
+ {
+ conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
+ if ( !conf->reg ) {
+ fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ else
+ {
+ conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
+ fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ nregs = regidx_nregs(conf->reg);
+ conf->reg_itr = regitr_init(conf->reg);
+ regitr_loop(conf->reg_itr); // region iterator now positioned at the first region
+ }
+
+ // read the header of each file in the list and initialize data
+ // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
+ bam_hdr_t *hdr = NULL; // header of first file in input list
+ int i;
+ for (i = 0; i < conf->nfiles; ++i) {
+ bam_hdr_t *h_tmp;
+ conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
+ conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
+ if ( !conf->mplp_data[i]->fp )
+ {
+ fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ exit(EXIT_FAILURE);
+ }
+ if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->conf = conf;
+ conf->mplp_data[i]->ref = &mp_ref;
+ h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
+ if ( !h_tmp ) {
+ fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
+ conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
+ if ( conf->mplp_data[i]->bam_id<0 )
+ {
+ // no usable readgroups in this bam, it can be skipped
+ sam_close(conf->mplp_data[i]->fp);
+ free(conf->mplp_data[i]);
+ bam_hdr_destroy(h_tmp);
+ free(conf->files[i]);
+ if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
+ conf->nfiles--;
+ i--;
+ continue;
+ }
+ if (conf->reg) {
+ hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
+ if (idx == NULL) {
+ fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ if ( nregs==1 ) // no need to keep the index in memory
+ hts_idx_destroy(idx);
+ else
+ conf->mplp_data[i]->idx = idx;
+ }
+
+ if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
+ else {
+ // FIXME: check consistency between h and h_tmp
+ bam_hdr_destroy(h_tmp);
+
+ // we store only the first file's header; it's (alleged to be)
+ // compatible with the i-th file's target_name lookup needs
+ conf->mplp_data[i]->h = hdr;
+ }
+ }
+ // allocate data storage proportionate to number of samples being studied sm->n
+ bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
+ conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
+
+ fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
+ // write the VCF header
+ conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+ if (conf->bcf_fp == NULL) {
+ fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
+
+ // BCF header creation
+ conf->bcf_hdr = bcf_hdr_init("w");
+ conf->buf.l = 0;
+
+ if (conf->record_cmd_line)
+ {
+ ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
+ for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
+ kputc('\n', &conf->buf);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ if (conf->fai_fname)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ // Translate BAM @SQ tags to BCF ##contig tags
+ // todo: use/write new BAM header manipulation routines, fill also UR, M5
+ for (i=0; i<hdr->n_targets; i++)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+ conf->buf.l = 0;
+
+ bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+#if CDF_MWU_TESTS
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
+#endif
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
+ if ( conf->fmt_flag&B2B_FMT_DP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DV )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_INFO_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_FMT_DP4 )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
+ if ( conf->fmt_flag&B2B_FMT_SP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
+ if ( conf->fmt_flag&B2B_FMT_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
+ if ( conf->fmt_flag&B2B_FMT_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_FMT_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
+ if ( conf->fmt_flag&B2B_INFO_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
+ if ( conf->fmt_flag&B2B_INFO_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_INFO_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
+ if ( conf->gvcf )
+ gvcf_update_header(conf->gvcf, conf->bcf_hdr);
+
+ int nsmpl;
+ const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
+ for (i=0; i<nsmpl; i++)
+ bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
+ bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr);
+
+ conf->bca = bcf_call_init(-1., conf->min_baseQ);
+ conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
+ conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+ conf->bca->min_frac = conf->min_frac;
+ conf->bca->min_support = conf->min_support;
+ conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
+
+ conf->bc.bcf_hdr = conf->bcf_hdr;
+ conf->bc.n = nsmpl;
+ conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+ if (conf->fmt_flag)
+ {
+ assert( sizeof(float)==sizeof(int32_t) );
+ conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
+ conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
+ if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
+ {
+ // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
+ conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ for (i=0; i<nsmpl; i++)
+ {
+ conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
+ conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
+ }
+ }
+ }
+
+ // init mpileup
+ conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
+ if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
+ if ( (double)conf->max_depth * conf->nfiles > 1<<20)
+ fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
+ if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
+ fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
+ bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
+ conf->max_indel_depth = conf->max_indel_depth * nsmpl;
+ conf->bcf_rec = bcf_init1();
+ bam_mplp_constructor(conf->iter, pileup_constructor);
+
+ // Run mpileup for multiple regions
+ if ( nregs )
+ {
+ int ireg = 0;
+ do
+ {
+ // first region is already positioned
+ if ( ireg++ > 0 )
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end);
+
+ for (i=0; i<conf->nfiles; i++)
+ {
+ hts_itr_destroy(conf->mplp_data[i]->iter);
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ bam_mplp_reset(conf->iter);
+ }
+ }
+ mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+ }
+ while ( regitr_loop(conf->reg_itr) );
+ }
+ else
+ mpileup_reg(conf,0,0);
+
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
+
+ // clean up
+ free(conf->bc.tmp.s);
+ bcf_destroy1(conf->bcf_rec);
+ if (conf->bcf_fp)
+ {
+ hts_close(conf->bcf_fp);
+ bcf_hdr_destroy(conf->bcf_hdr);
+ bcf_call_destroy(conf->bca);
+ free(conf->bc.PL);
+ free(conf->bc.DP4);
+ free(conf->bc.ADR);
+ free(conf->bc.ADF);
+ free(conf->bc.fmt_arr);
+ free(conf->bcr);
+ }
+ if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
+ free(conf->buf.s);
+ for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
+ free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
+ bam_mplp_destroy(conf->iter);
+ bam_hdr_destroy(hdr);
+ for (i = 0; i < conf->nfiles; ++i) {
+ if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
+ sam_close(conf->mplp_data[i]->fp);
+ if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
+ free(conf->mplp_data[i]);
+ }
+ if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
+ free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
+ free(mp_ref.ref[0]);
+ free(mp_ref.ref[1]);
+ return 0;
+}
+
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
+#define MAX_PATH_LEN 1024
+int read_file_list(const char *file_list,int *n,char **argv[])
+{
+ char buf[MAX_PATH_LEN];
+ int len, nfiles = 0;
+ char **files = NULL;
+ struct stat sb;
+
+ *n = 0;
+ *argv = NULL;
+
+ FILE *fh = fopen(file_list,"r");
+ if ( !fh )
+ {
+ fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
+ return 1;
+ }
+
+ files = (char**) calloc(nfiles,sizeof(char*));
+ nfiles = 0;
+ while ( fgets(buf,MAX_PATH_LEN,fh) )
+ {
+ // allow empty lines and trailing spaces
+ len = strlen(buf);
+ while ( len>0 && isspace(buf[len-1]) ) len--;
+ if ( !len ) continue;
+
+ // check sanity of the file list
+ buf[len] = 0;
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
+ {
+ // no such file, check if it is safe to print its name
+ int i, safe_to_print = 1;
+ for (i=0; i<len; i++)
+ if (!isprint(buf[i])) { safe_to_print = 0; break; }
+ if ( safe_to_print )
+ fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+ else
+ fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+ return 1;
+ }
+
+ nfiles++;
+ files = (char**) realloc(files,nfiles*sizeof(char*));
+ files[nfiles-1] = strdup(buf);
+ }
+ fclose(fh);
+ if ( !nfiles )
+ {
+ fprintf(stderr,"No files read from %s\n", file_list);
+ return 1;
+ }
+ *argv = files;
+ *n = nfiles;
+ return 0;
+}
+#undef MAX_PATH_LEN
+
+int parse_format_flag(const char *str)
+{
+ int i, flag = 0, n_tags;
+ char **tags = hts_readlist(str, 0, &n_tags);
+ for(i=0; i<n_tags; i++)
+ {
+ if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
+ else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
+ else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
+ else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
+ else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
+ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
+ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
+ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+ else
+ {
+ fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
+ exit(EXIT_FAILURE);
+ }
+ free(tags[i]);
+ }
+ if (n_tags) free(tags);
+ return flag;
+}
+
+static void list_annotations(FILE *fp)
+{
+ fprintf(fp,
+"\n"
+"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
+"\n"
+" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+"\n"
+"INFO annotation tags available:\n"
+"\n"
+" INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
+" INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+"\n");
+}
+
+static void print_usage(FILE *fp, const mplp_conf_t *mplp)
+{
+ char *tmp_require = bam_flag2str(mplp->rflag_require);
+ char *tmp_filter = bam_flag2str(mplp->rflag_filter);
+
+ // Display usage information, formatted for the standard 80 columns.
+ // (The unusual string formatting here aids the readability of this
+ // source code in 80 columns, to the extent that's possible.)
+
+ fprintf(fp,
+"\n"
+"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+"\n"
+"Input options:\n"
+" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
+" -A, --count-orphans do not discard anomalous read pairs\n"
+" -b, --bam-list FILE list of input BAM filenames, one per line\n"
+" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
+" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
+" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ fprintf(fp,
+" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
+" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
+" --no-reference do not require fasta reference file\n"
+" -G, --read-groups FILE select or exclude read groups listed in the file\n"
+" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ fprintf(fp,
+" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ fprintf(fp,
+" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+" -R, --regions-file FILE restrict to regions listed in a file\n"
+" --ignore-RG ignore RG tags (one BAM = one sample)\n"
+" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ fprintf(fp,
+" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+" [%s]\n", tmp_filter);
+ fprintf(fp,
+" -s, --samples LIST comma separated list of samples to include\n"
+" -S, --samples-file FILE file of samples to include\n"
+" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+" -x, --ignore-overlaps disable read-pair overlap detection\n"
+"\n"
+"Output options:\n"
+" -a, --annotate LIST optional tags to output; '?' to list []\n"
+" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
+" to minimum per-sample DP\n"
+" --no-version do not append version and command line to the header\n"
+" -o, --output FILE write output to FILE [standard output]\n"
+" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
+" 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+" --threads INT number of extra output compression threads [0]\n"
+"\n"
+"SNP/INDEL genotype likelihoods options:\n"
+" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+ fprintf(fp,
+" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ fprintf(fp,
+" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ fprintf(fp,
+" -I, --skip-indels do not perform indel calling\n"
+" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ fprintf(fp,
+" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ fprintf(fp,
+" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ fprintf(fp,
+" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
+" -P, --platforms STR comma separated list of platforms for indels [all]\n"
+"\n"
+"Notes: Assuming diploid individuals.\n"
+"\n");
+
+ free(tmp_require);
+ free(tmp_filter);
+}
+
+int bam_mpileup(int argc, char *argv[])
+{
+ int c;
+ const char *file_list = NULL;
+ char **fn = NULL;
+ int nfiles = 0, use_orphan = 0, noref = 0;
+ mplp_conf_t mplp;
+ memset(&mplp, 0, sizeof(mplp_conf_t));
+ mplp.min_baseQ = 13;
+ mplp.capQ_thres = 0;
+ mplp.max_depth = 250; mplp.max_indel_depth = 250;
+ mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
+ mplp.min_frac = 0.002; mplp.min_support = 1;
+ mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+ mplp.argc = argc; mplp.argv = argv;
+ mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
+ mplp.output_fname = NULL;
+ mplp.output_type = FT_VCF;
+ mplp.record_cmd_line = 1;
+ mplp.n_threads = 0;
+ mplp.bsmpl = bam_smpl_init();
+
+ static const struct option lopts[] =
+ {
+ {"rf", required_argument, NULL, 1}, // require flag
+ {"ff", required_argument, NULL, 2}, // filter flag
+ {"incl-flags", required_argument, NULL, 1},
+ {"excl-flags", required_argument, NULL, 2},
+ {"output", required_argument, NULL, 3},
+ {"open-prob", required_argument, NULL, 4},
+ {"ignore-RG", no_argument, NULL, 5},
+ {"ignore-rg", no_argument, NULL, 5},
+ {"gvcf", required_argument, NULL, 'g'},
+ {"non-reference", no_argument, NULL, 7},
+ {"no-version", no_argument, NULL, 8},
+ {"threads",required_argument,NULL,9},
+ {"illumina1.3+", no_argument, NULL, '6'},
+ {"count-orphans", no_argument, NULL, 'A'},
+ {"bam-list", required_argument, NULL, 'b'},
+ {"no-BAQ", no_argument, NULL, 'B'},
+ {"no-baq", no_argument, NULL, 'B'},
+ {"adjust-MQ", required_argument, NULL, 'C'},
+ {"adjust-mq", required_argument, NULL, 'C'},
+ {"max-depth", required_argument, NULL, 'd'},
+ {"redo-BAQ", no_argument, NULL, 'E'},
+ {"redo-baq", no_argument, NULL, 'E'},
+ {"fasta-ref", required_argument, NULL, 'f'},
+ {"read-groups", required_argument, NULL, 'G'},
+ {"region", required_argument, NULL, 'r'},
+ {"regions", required_argument, NULL, 'r'},
+ {"regions-file", required_argument, NULL, 'R'},
+ {"targets", required_argument, NULL, 't'},
+ {"targets-file", required_argument, NULL, 'T'},
+ {"min-MQ", required_argument, NULL, 'q'},
+ {"min-mq", required_argument, NULL, 'q'},
+ {"min-BQ", required_argument, NULL, 'Q'},
+ {"min-bq", required_argument, NULL, 'Q'},
+ {"ignore-overlaps", no_argument, NULL, 'x'},
+ {"output-type", required_argument, NULL, 'O'},
+ {"samples", required_argument, NULL, 's'},
+ {"samples-file", required_argument, NULL, 'S'},
+ {"annotate", required_argument, NULL, 'a'},
+ {"ext-prob", required_argument, NULL, 'e'},
+ {"gap-frac", required_argument, NULL, 'F'},
+ {"tandem-qual", required_argument, NULL, 'h'},
+ {"skip-indels", no_argument, NULL, 'I'},
+ {"max-idepth", required_argument, NULL, 'L'},
+ {"min-ireads ", required_argument, NULL, 'm'},
+ {"per-sample-mF", no_argument, NULL, 'p'},
+ {"per-sample-mf", no_argument, NULL, 'p'},
+ {"platforms", required_argument, NULL, 'P'},
+ {NULL, 0, NULL, 0}
+ };
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+ switch (c) {
+ case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
+ case 1 :
+ mplp.rflag_require = bam_str2flag(optarg);
+ if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; }
+ break;
+ case 2 :
+ mplp.rflag_filter = bam_str2flag(optarg);
+ if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; }
+ break;
+ case 3 : mplp.output_fname = optarg; break;
+ case 4 : mplp.openQ = atoi(optarg); break;
+ case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break;
+ case 'g':
+ mplp.gvcf = gvcf_init(optarg);
+ if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
+ break;
+ case 'f':
+ mplp.fai = fai_load(optarg);
+ if (mplp.fai == NULL) return 1;
+ mplp.fai_fname = optarg;
+ break;
+ case 7 : noref = 1; break;
+ case 8 : mplp.record_cmd_line = 0; break;
+ case 9 : mplp.n_threads = strtol(optarg, 0, 0); break;
+ case 'd': mplp.max_depth = atoi(optarg); break;
+ case 'r': mplp.reg_fname = strdup(optarg); break;
+ case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break;
+ case 't':
+ // In the original version the whole BAM was streamed which is inefficient
+ // with few BED intervals and big BAMs. Todo: devise a heuristic to determine
+ // best strategy, that is streaming or jumping.
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL);
+ mplp.bed_itr = regitr_init(mplp.bed);
+ if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
+ {
+ fprintf(stderr,"Could not parse the targets: %s\n", optarg);
+ exit(EXIT_FAILURE);
+ }
+ break;
+ case 'T':
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
+ if (!mplp.bed) { fprintf(stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
+ break;
+ case 'P': mplp.pl_list = strdup(optarg); break;
+ case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
+ case 'B': mplp.flag &= ~MPLP_REALN; break;
+ case 'I': mplp.flag |= MPLP_NO_INDEL; break;
+ case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
+ case '6': mplp.flag |= MPLP_ILLUMINA13; break;
+ case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': mplp.output_type = FT_BCF_GZ; break;
+ case 'u': mplp.output_type = FT_BCF; break;
+ case 'z': mplp.output_type = FT_VCF_GZ; break;
+ case 'v': mplp.output_type = FT_VCF; break;
+ default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n", optarg);
+ }
+ break;
+ case 'C': mplp.capQ_thres = atoi(optarg); break;
+ case 'q': mplp.min_mq = atoi(optarg); break;
+ case 'Q': mplp.min_baseQ = atoi(optarg); break;
+ case 'b': file_list = optarg; break;
+ case 'o': {
+ char *end;
+ long value = strtol(optarg, &end, 10);
+ // Distinguish between -o INT and -o FILE (a bit of a hack!)
+ if (*end == '\0') mplp.openQ = value;
+ else mplp.output_fname = optarg;
+ }
+ break;
+ case 'e': mplp.extQ = atoi(optarg); break;
+ case 'h': mplp.tandemQ = atoi(optarg); break;
+ case 'A': use_orphan = 1; break;
+ case 'F': mplp.min_frac = atof(optarg); break;
+ case 'm': mplp.min_support = atoi(optarg); break;
+ case 'L': mplp.max_indel_depth = atoi(optarg); break;
+ case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break;
+ case 'a':
+ if (optarg[0]=='?') {
+ list_annotations(stderr);
+ return 1;
+ }
+ mplp.fmt_flag |= parse_format_flag(optarg);
+ break;
+ default:
+ fprintf(stderr,"Invalid option: '%c'\n", c);
+ return 1;
+ }
+ }
+
+ if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) )
+ {
+ fprintf(stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n");
+ mplp.fmt_flag |= B2B_FMT_DP;
+ }
+ if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) )
+ {
+ if ( mplp.flag&MPLP_VCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF;
+ else mplp.output_type = FT_VCF_GZ;
+ }
+ else if ( mplp.flag&MPLP_BCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF;
+ else mplp.output_type = FT_BCF_GZ;
+ }
+ }
+ if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
+ {
+ fprintf(stderr,"Error: The -B option cannot be combined with -E\n");
+ return 1;
+ }
+ if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
+ if (argc == 1)
+ {
+ print_usage(stderr, &mplp);
+ return 1;
+ }
+ if (!mplp.fai && !noref) {
+ fprintf(stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
+ return 1;
+ }
+ int ret,i;
+ if (file_list)
+ {
+ if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+ mplp.files = fn;
+ mplp.nfiles = nfiles;
+ }
+ else
+ {
+ mplp.nfiles = argc - optind;
+ mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*));
+ for (i=0; i<mplp.nfiles; i++) mplp.files[i] = strdup(argv[optind+i]);
+ }
+ ret = mpileup(&mplp);
+
+ for (i=0; i<mplp.nfiles; i++) free(mplp.files[i]);
+ free(mplp.files);
+ free(mplp.reg_fname); free(mplp.pl_list);
+ if (mplp.fai) fai_destroy(mplp.fai);
+ if (mplp.bed)
+ {
+ regidx_destroy(mplp.bed);
+ regitr_destroy(mplp.bed_itr);
+ }
+ if (mplp.reg) regidx_destroy(mplp.reg);
+ bam_smpl_destroy(mplp.bsmpl);
+ return ret;
+}
--- /dev/null
+#include "pysam.h"
+
+/* mpileup.c -- mpileup subcommand. Previously bam_plcmd.c from samtools
+
+ Copyright (C) 2008-2017 Genome Research Ltd.
+ Portions copyright (C) 2009-2012 Broad Institute.
+
+ Author: Heng Li <lh3@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <string.h>
+#include <strings.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <getopt.h>
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+#include <htslib/kstring.h>
+#include <htslib/khash_str2int.h>
+#include <assert.h>
+#include "regidx.h"
+#include "bcftools.h"
+#include "bam2bcf.h"
+#include "bam_sample.h"
+#include "gvcf.h"
+
+#define MPLP_BCF 1
+#define MPLP_VCF (1<<1)
+#define MPLP_NO_COMP (1<<2)
+#define MPLP_NO_ORPHAN (1<<3)
+#define MPLP_REALN (1<<4)
+#define MPLP_NO_INDEL (1<<5)
+#define MPLP_REDO_BAQ (1<<6)
+#define MPLP_ILLUMINA13 (1<<7)
+#define MPLP_IGNORE_RG (1<<8)
+#define MPLP_PRINT_POS (1<<9)
+#define MPLP_PRINT_MAPQ (1<<10)
+#define MPLP_PER_SAMPLE (1<<11)
+#define MPLP_SMART_OVERLAPS (1<<12)
+
+typedef struct _mplp_aux_t mplp_aux_t;
+typedef struct _mplp_pileup_t mplp_pileup_t;
+
+// Data shared by all bam files
+typedef struct {
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int rflag_require, rflag_filter, output_type;
+ int openQ, extQ, tandemQ, min_support; // for indels
+ double min_frac; // for indels
+ char *reg_fname, *pl_list, *fai_fname, *output_fname;
+ int reg_is_file, record_cmd_line, n_threads;
+ faidx_t *fai;
+ regidx_t *bed, *reg; // bed: skipping regions, reg: index-jump to regions
+ regitr_t *bed_itr, *reg_itr;
+ int bed_logic; // 1: include region, 0: exclude region
+ gvcf_t *gvcf;
+
+ // auxiliary structures for calling
+ bcf_callaux_t *bca;
+ bcf_callret1_t *bcr;
+ bcf_call_t bc;
+ bam_mplp_t iter;
+ mplp_aux_t **mplp_data;
+ int nfiles;
+ char **files;
+ mplp_pileup_t *gplp;
+ int *n_plp;
+ const bam_pileup1_t **plp;
+ bam_smpl_t *bsmpl;
+ kstring_t buf;
+ bcf1_t *bcf_rec;
+ htsFile *bcf_fp;
+ bcf_hdr_t *bcf_hdr;
+ int argc;
+ char **argv;
+} mplp_conf_t;
+
+typedef struct {
+ char *ref[2];
+ int ref_id[2];
+ int ref_len[2];
+} mplp_ref_t;
+
+#define MPLP_REF_INIT {{NULL,NULL},{-1,-1},{0,0}}
+
+// Data specific to each bam file
+struct _mplp_aux_t {
+ samFile *fp;
+ hts_itr_t *iter;
+ bam_hdr_t *h;
+ mplp_ref_t *ref;
+ const mplp_conf_t *conf;
+ int bam_id;
+ hts_idx_t *idx; // maintained only with more than one -r regions
+};
+
+// Data passed to htslib/mpileup
+struct _mplp_pileup_t {
+ int n;
+ int *n_plp, *m_plp;
+ bam_pileup1_t **plp;
+};
+
+static int mplp_get_ref(mplp_aux_t *ma, int tid, char **ref, int *ref_len) {
+ mplp_ref_t *r = ma->ref;
+
+ //printf("get ref %d {%d/%p, %d/%p}\n", tid, r->ref_id[0], r->ref[0], r->ref_id[1], r->ref[1]);
+
+ if (!r || !ma->conf->fai) {
+ *ref = NULL;
+ return 0;
+ }
+
+ // Do we need to reference count this so multiple mplp_aux_t can
+ // track which references are in use?
+ // For now we just cache the last two. Sufficient?
+ if (tid == r->ref_id[0]) {
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+ if (tid == r->ref_id[1]) {
+ // Last, swap over
+ int tmp;
+ tmp = r->ref_id[0]; r->ref_id[0] = r->ref_id[1]; r->ref_id[1] = tmp;
+ tmp = r->ref_len[0]; r->ref_len[0] = r->ref_len[1]; r->ref_len[1] = tmp;
+
+ char *tc;
+ tc = r->ref[0]; r->ref[0] = r->ref[1]; r->ref[1] = tc;
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+ }
+
+ // New, so migrate to old and load new
+ free(r->ref[1]);
+ r->ref[1] = r->ref[0];
+ r->ref_id[1] = r->ref_id[0];
+ r->ref_len[1] = r->ref_len[0];
+
+ r->ref_id[0] = tid;
+ r->ref[0] = faidx_fetch_seq(ma->conf->fai,
+ ma->h->target_name[r->ref_id[0]],
+ 0,
+ INT_MAX,
+ &r->ref_len[0]);
+
+ if (!r->ref[0]) {
+ r->ref[0] = NULL;
+ r->ref_id[0] = -1;
+ r->ref_len[0] = 0;
+ *ref = NULL;
+ return 0;
+ }
+
+ *ref = r->ref[0];
+ *ref_len = r->ref_len[0];
+ return 1;
+}
+
+static int mplp_func(void *data, bam1_t *b)
+{
+ char *ref;
+ mplp_aux_t *ma = (mplp_aux_t*)data;
+ int ret, ref_len;
+ while (1)
+ {
+ int has_ref;
+ ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b);
+ if (ret < 0) break;
+ // The 'B' cigar operation is not part of the specification, considering as obsolete.
+ // bam_remove_B(b);
+ if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads
+ if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue;
+ if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue;
+ if (ma->conf->bed)
+ {
+ // test overlap
+ regitr_t *itr = ma->conf->bed_itr;
+ int beg = b->core.pos, end = bam_endpos(b)-1;
+ int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr);
+ if ( !ma->conf->bed_logic && !overlap )
+ {
+ // exclude only reads which are fully contained in the region
+ while ( regitr_overlap(itr) )
+ {
+ if ( beg < itr->beg ) { overlap = 1; break; }
+ if ( end > itr->end ) { overlap = 1; break; }
+ }
+ }
+ if ( !overlap ) continue;
+ }
+ if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue;
+ if (ma->conf->flag & MPLP_ILLUMINA13) {
+ int i;
+ uint8_t *qual = bam_get_qual(b);
+ for (i = 0; i < b->core.l_qseq; ++i)
+ qual[i] = qual[i] > 31? qual[i] - 31 : 0;
+ }
+
+ if (ma->conf->fai && b->core.tid >= 0) {
+ has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len);
+ if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence
+ fprintf(pysam_stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n",
+ __func__, b->core.pos, ref_len, b->core.tid);
+ continue;
+ }
+ } else {
+ has_ref = 0;
+ }
+
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && ma->conf->capQ_thres > 10) {
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
+ if (q < 0) continue; // skip
+ else if (b->core.qual > q) b->core.qual = q;
+ }
+ if (b->core.qual < ma->conf->min_mq) continue;
+ else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue;
+
+ return ret;
+ };
+ return ret;
+}
+
+// Called once per new bam added to the pileup.
+// We cache sample information here so we don't have to keep recomputing this
+// on each and every pileup column.
+//
+// Cd is an arbitrary block of data we can write into, which ends up in
+// the pileup structures. We stash the sample ID there.
+static int pileup_constructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+ mplp_aux_t *ma = (mplp_aux_t *)data;
+ cd->i = bam_smpl_get_sample_id(ma->conf->bsmpl, ma->bam_id, (bam1_t *)b);
+ return 0;
+}
+
+static void group_smpl(mplp_pileup_t *m, bam_smpl_t *bsmpl, int n, int *n_plp, const bam_pileup1_t **plp)
+{
+ int i, j;
+ memset(m->n_plp, 0, m->n * sizeof(int));
+ for (i = 0; i < n; ++i) // iterate over all bams
+ {
+ for (j = 0; j < n_plp[i]; ++j) // iterate over all reads available at this position
+ {
+ const bam_pileup1_t *p = plp[i] + j;
+ int id = p->cd.i;
+ if (m->n_plp[id] == m->m_plp[id])
+ {
+ m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
+ m->plp[id] = (bam_pileup1_t*) realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
+ }
+ m->plp[id][m->n_plp[id]++] = *p;
+ }
+ }
+}
+
+static void flush_bcf_records(mplp_conf_t *conf, htsFile *fp, bcf_hdr_t *hdr, bcf1_t *rec)
+{
+ if ( !conf->gvcf )
+ {
+ if ( rec ) bcf_write1(fp, hdr, rec);
+ return;
+ }
+
+ if ( !rec )
+ {
+ gvcf_write(conf->gvcf, fp, hdr, NULL, 0);
+ return;
+ }
+
+ int is_ref = 0;
+ if ( rec->n_allele==1 ) is_ref = 1;
+ else if ( rec->n_allele==2 )
+ {
+ // second allele is mpileup's X, not a variant
+ if ( rec->d.allele[1][0]=='<' && rec->d.allele[1][1]=='*' && rec->d.allele[1][2]=='>' ) is_ref = 1;
+ }
+ rec = gvcf_write(conf->gvcf, fp, hdr, rec, is_ref);
+ if ( rec ) bcf_write1(fp,hdr,rec);
+}
+
+static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end)
+{
+ bam_hdr_t *hdr = conf->mplp_data[0]->h; // header of first file in input list
+
+ int ret, i, tid, pos, ref_len;
+ char *ref;
+
+ while ( (ret=bam_mplp_auto(conf->iter, &tid, &pos, conf->n_plp, conf->plp)) > 0)
+ {
+ if ( end && (pos<beg || pos>end) ) continue;
+ if ( conf->bed && tid >= 0 )
+ {
+ int overlap = regidx_overlap(conf->bed, hdr->target_name[tid], pos, pos, NULL);
+ if ( !conf->bed_logic ) overlap = overlap ? 0 : 1;
+ if ( !overlap ) continue;
+ }
+ mplp_get_ref(conf->mplp_data[0], tid, &ref, &ref_len);
+
+ int total_depth, _ref0, ref16;
+ for (i = total_depth = 0; i < conf->nfiles; ++i) total_depth += conf->n_plp[i];
+ group_smpl(conf->gplp, conf->bsmpl, conf->nfiles, conf->n_plp, conf->plp);
+ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
+ ref16 = seq_nt16_table[_ref0];
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], ref16, conf->bca, conf->bcr + i);
+ conf->bc.tid = tid; conf->bc.pos = pos;
+ bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, ref16, &conf->bc);
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, 0, 0);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+
+ // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
+ // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them
+ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth
+ && bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)
+ {
+ bcf_callaux_clean(conf->bca, &conf->bc);
+ for (i = 0; i < conf->gplp->n; ++i)
+ bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);
+ if (bcf_call_combine(conf->gplp->n, conf->bcr, conf->bca, -1, &conf->bc) >= 0)
+ {
+ bcf_clear1(conf->bcf_rec);
+ bcf_call2bcf(&conf->bc, conf->bcf_rec, conf->bcr, conf->fmt_flag, conf->bca, ref);
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, conf->bcf_rec);
+ }
+ }
+ }
+ return 0;
+}
+
+static int mpileup(mplp_conf_t *conf)
+{
+ if (conf->nfiles == 0) {
+ fprintf(pysam_stderr,"[%s] no input file/data given\n", __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ mplp_ref_t mp_ref = MPLP_REF_INIT;
+ conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
+ conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
+ conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
+ conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));
+
+ // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
+ // must be kept in the memory for the whole time which can be a problem with many bams.
+ // Therefore if none or only one region is requested, we initialize the bam iterator as
+ // before and free the index. Only when multiple regions are queried, we keep the index.
+ int nregs = 0;
+ if ( conf->reg_fname )
+ {
+ if ( conf->reg_is_file )
+ {
+ conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
+ if ( !conf->reg ) {
+ fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ else
+ {
+ conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
+ fprintf(pysam_stderr,"Could not parse the regions: %s\n", conf->reg_fname);
+ exit(EXIT_FAILURE);
+ }
+ }
+ nregs = regidx_nregs(conf->reg);
+ conf->reg_itr = regitr_init(conf->reg);
+ regitr_loop(conf->reg_itr); // region iterator now positioned at the first region
+ }
+
+ // read the header of each file in the list and initialize data
+ // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
+ bam_hdr_t *hdr = NULL; // header of first file in input list
+ int i;
+ for (i = 0; i < conf->nfiles; ++i) {
+ bam_hdr_t *h_tmp;
+ conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
+ conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
+ if ( !conf->mplp_data[i]->fp )
+ {
+ fprintf(pysam_stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
+ fprintf(pysam_stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
+ exit(EXIT_FAILURE);
+ }
+ if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
+ fprintf(pysam_stderr, "[%s] failed to process %s: %s\n",
+ __func__, conf->fai_fname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->conf = conf;
+ conf->mplp_data[i]->ref = &mp_ref;
+ h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
+ if ( !h_tmp ) {
+ fprintf(pysam_stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
+ conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
+ if ( conf->mplp_data[i]->bam_id<0 )
+ {
+ // no usable readgroups in this bam, it can be skipped
+ sam_close(conf->mplp_data[i]->fp);
+ free(conf->mplp_data[i]);
+ bam_hdr_destroy(h_tmp);
+ free(conf->files[i]);
+ if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
+ conf->nfiles--;
+ i--;
+ continue;
+ }
+ if (conf->reg) {
+ hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
+ if (idx == NULL) {
+ fprintf(pysam_stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ if ( nregs==1 ) // no need to keep the index in memory
+ hts_idx_destroy(idx);
+ else
+ conf->mplp_data[i]->idx = idx;
+ }
+
+ if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
+ else {
+ // FIXME: check consistency between h and h_tmp
+ bam_hdr_destroy(h_tmp);
+
+ // we store only the first file's header; it's (alleged to be)
+ // compatible with the i-th file's target_name lookup needs
+ conf->mplp_data[i]->h = hdr;
+ }
+ }
+ // allocate data storage proportionate to number of samples being studied sm->n
+ bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
+ conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
+ conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));
+
+ fprintf(pysam_stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
+ // write the VCF header
+ conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
+ if (conf->bcf_fp == NULL) {
+ fprintf(pysam_stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);
+
+ // BCF header creation
+ conf->bcf_hdr = bcf_hdr_init("w");
+ conf->buf.l = 0;
+
+ if (conf->record_cmd_line)
+ {
+ ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
+ for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
+ kputc('\n', &conf->buf);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ if (conf->fai_fname)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+
+ // Translate BAM @SQ tags to BCF ##contig tags
+ // todo: use/write new BAM header manipulation routines, fill also UR, M5
+ for (i=0; i<hdr->n_targets; i++)
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
+ bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
+ }
+ conf->buf.l = 0;
+
+ bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
+#if CDF_MWU_TESTS
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
+#endif
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
+ if ( conf->fmt_flag&B2B_FMT_DP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DV )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
+ if ( conf->fmt_flag&B2B_FMT_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_INFO_DPR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
+ if ( conf->fmt_flag&B2B_FMT_DP4 )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
+ if ( conf->fmt_flag&B2B_FMT_SP )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
+ if ( conf->fmt_flag&B2B_FMT_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
+ if ( conf->fmt_flag&B2B_FMT_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_FMT_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
+ if ( conf->fmt_flag&B2B_INFO_AD )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
+ if ( conf->fmt_flag&B2B_INFO_ADF )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
+ if ( conf->fmt_flag&B2B_INFO_ADR )
+ bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
+ if ( conf->gvcf )
+ gvcf_update_header(conf->gvcf, conf->bcf_hdr);
+
+ int nsmpl;
+ const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
+ for (i=0; i<nsmpl; i++)
+ bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
+ bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr);
+
+ conf->bca = bcf_call_init(-1., conf->min_baseQ);
+ conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
+ conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
+ conf->bca->min_frac = conf->min_frac;
+ conf->bca->min_support = conf->min_support;
+ conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
+
+ conf->bc.bcf_hdr = conf->bcf_hdr;
+ conf->bc.n = nsmpl;
+ conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
+ if (conf->fmt_flag)
+ {
+ assert( sizeof(float)==sizeof(int32_t) );
+ conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
+ conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
+ if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
+ {
+ // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
+ conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
+ for (i=0; i<nsmpl; i++)
+ {
+ conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
+ conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
+ }
+ }
+ }
+
+ // init mpileup
+ conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
+ if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
+ if ( (double)conf->max_depth * conf->nfiles > 1<<20)
+ fprintf(pysam_stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
+ if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
+ fprintf(pysam_stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
+ bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
+ conf->max_indel_depth = conf->max_indel_depth * nsmpl;
+ conf->bcf_rec = bcf_init1();
+ bam_mplp_constructor(conf->iter, pileup_constructor);
+
+ // Run mpileup for multiple regions
+ if ( nregs )
+ {
+ int ireg = 0;
+ do
+ {
+ // first region is already positioned
+ if ( ireg++ > 0 )
+ {
+ conf->buf.l = 0;
+ ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end);
+
+ for (i=0; i<conf->nfiles; i++)
+ {
+ hts_itr_destroy(conf->mplp_data[i]->iter);
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
+ if ( !conf->mplp_data[i]->iter )
+ {
+ conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
+ if ( conf->mplp_data[i]->iter ) {
+ fprintf(pysam_stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
+ exit(EXIT_FAILURE);
+ }
+ fprintf(pysam_stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
+ exit(EXIT_FAILURE);
+ }
+ bam_mplp_reset(conf->iter);
+ }
+ }
+ mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
+ }
+ while ( regitr_loop(conf->reg_itr) );
+ }
+ else
+ mpileup_reg(conf,0,0);
+
+ flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);
+
+ // clean up
+ free(conf->bc.tmp.s);
+ bcf_destroy1(conf->bcf_rec);
+ if (conf->bcf_fp)
+ {
+ hts_close(conf->bcf_fp);
+ bcf_hdr_destroy(conf->bcf_hdr);
+ bcf_call_destroy(conf->bca);
+ free(conf->bc.PL);
+ free(conf->bc.DP4);
+ free(conf->bc.ADR);
+ free(conf->bc.ADF);
+ free(conf->bc.fmt_arr);
+ free(conf->bcr);
+ }
+ if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
+ free(conf->buf.s);
+ for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
+ free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
+ bam_mplp_destroy(conf->iter);
+ bam_hdr_destroy(hdr);
+ for (i = 0; i < conf->nfiles; ++i) {
+ if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
+ sam_close(conf->mplp_data[i]->fp);
+ if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
+ free(conf->mplp_data[i]);
+ }
+ if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
+ free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
+ free(mp_ref.ref[0]);
+ free(mp_ref.ref[1]);
+ return 0;
+}
+
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
+#define MAX_PATH_LEN 1024
+int read_file_list(const char *file_list,int *n,char **argv[])
+{
+ char buf[MAX_PATH_LEN];
+ int len, nfiles = 0;
+ char **files = NULL;
+ struct stat sb;
+
+ *n = 0;
+ *argv = NULL;
+
+ FILE *fh = fopen(file_list,"r");
+ if ( !fh )
+ {
+ fprintf(pysam_stderr,"%s: %s\n", file_list,strerror(errno));
+ return 1;
+ }
+
+ files = (char**) calloc(nfiles,sizeof(char*));
+ nfiles = 0;
+ while ( fgets(buf,MAX_PATH_LEN,fh) )
+ {
+ // allow empty lines and trailing spaces
+ len = strlen(buf);
+ while ( len>0 && isspace(buf[len-1]) ) len--;
+ if ( !len ) continue;
+
+ // check sanity of the file list
+ buf[len] = 0;
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
+ {
+ // no such file, check if it is safe to print its name
+ int i, safe_to_print = 1;
+ for (i=0; i<len; i++)
+ if (!isprint(buf[i])) { safe_to_print = 0; break; }
+ if ( safe_to_print )
+ fprintf(pysam_stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+ else
+ fprintf(pysam_stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+ return 1;
+ }
+
+ nfiles++;
+ files = (char**) realloc(files,nfiles*sizeof(char*));
+ files[nfiles-1] = strdup(buf);
+ }
+ fclose(fh);
+ if ( !nfiles )
+ {
+ fprintf(pysam_stderr,"No files read from %s\n", file_list);
+ return 1;
+ }
+ *argv = files;
+ *n = nfiles;
+ return 0;
+}
+#undef MAX_PATH_LEN
+
+int parse_format_flag(const char *str)
+{
+ int i, flag = 0, n_tags;
+ char **tags = hts_readlist(str, 0, &n_tags);
+ for(i=0; i<n_tags; i++)
+ {
+ if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
+ else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(pysam_stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
+ else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(pysam_stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(pysam_stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(pysam_stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
+ else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
+ else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
+ else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
+ else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
+ else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
+ else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
+ else
+ {
+ fprintf(pysam_stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
+ exit(EXIT_FAILURE);
+ }
+ free(tags[i]);
+ }
+ if (n_tags) free(tags);
+ return flag;
+}
+
+static void list_annotations(FILE *fp)
+{
+ fprintf(fp,
+"\n"
+"FORMAT annotation tags available (\"FORMAT/\" prefix is optional):\n"
+"\n"
+" FORMAT/AD .. Allelic depth (Number=R,Type=Integer)\n"
+" FORMAT/ADF .. Allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" FORMAT/ADR .. Allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+" FORMAT/DP .. Number of high-quality bases (Number=1,Type=Integer)\n"
+" FORMAT/SP .. Phred-scaled strand bias P-value (Number=1,Type=Integer)\n"
+"\n"
+"INFO annotation tags available:\n"
+"\n"
+" INFO/AD .. Total allelic depth (Number=R,Type=Integer)\n"
+" INFO/ADF .. Total allelic depths on the forward strand (Number=R,Type=Integer)\n"
+" INFO/ADR .. Total allelic depths on the reverse strand (Number=R,Type=Integer)\n"
+"\n");
+}
+
+static void print_usage(FILE *fp, const mplp_conf_t *mplp)
+{
+ char *tmp_require = bam_flag2str(mplp->rflag_require);
+ char *tmp_filter = bam_flag2str(mplp->rflag_filter);
+
+ // Display usage information, formatted for the standard 80 columns.
+ // (The unusual string formatting here aids the readability of this
+ // source code in 80 columns, to the extent that's possible.)
+
+ fprintf(fp,
+"\n"
+"Usage: bcftools mpileup [options] in1.bam [in2.bam [...]]\n"
+"\n"
+"Input options:\n"
+" -6, --illumina1.3+ quality is in the Illumina-1.3+ encoding\n"
+" -A, --count-orphans do not discard anomalous read pairs\n"
+" -b, --bam-list FILE list of input BAM filenames, one per line\n"
+" -B, --no-BAQ disable BAQ (per-Base Alignment Quality)\n"
+" -C, --adjust-MQ INT adjust mapping quality; recommended:50, disable:0 [0]\n"
+" -d, --max-depth INT max per-file depth; avoids excessive memory usage [%d]\n", mplp->max_depth);
+ fprintf(fp,
+" -E, --redo-BAQ recalculate BAQ on the fly, ignore existing BQs\n"
+" -f, --fasta-ref FILE faidx indexed reference sequence file\n"
+" --no-reference do not require fasta reference file\n"
+" -G, --read-groups FILE select or exclude read groups listed in the file\n"
+" -q, --min-MQ INT skip alignments with mapQ smaller than INT [%d]\n", mplp->min_mq);
+ fprintf(fp,
+" -Q, --min-BQ INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp->min_baseQ);
+ fprintf(fp,
+" -r, --regions REG[,...] comma separated list of regions in which pileup is generated\n"
+" -R, --regions-file FILE restrict to regions listed in a file\n"
+" --ignore-RG ignore RG tags (one BAM = one sample)\n"
+" --rf, --incl-flags STR|INT required flags: skip reads with mask bits unset [%s]\n", tmp_require);
+ fprintf(fp,
+" --ff, --excl-flags STR|INT filter flags: skip reads with mask bits set\n"
+" [%s]\n", tmp_filter);
+ fprintf(fp,
+" -s, --samples LIST comma separated list of samples to include\n"
+" -S, --samples-file FILE file of samples to include\n"
+" -t, --targets REG[,...] similar to -r but streams rather than index-jumps\n"
+" -T, --targets-file FILE similar to -R but streams rather than index-jumps\n"
+" -x, --ignore-overlaps disable read-pair overlap detection\n"
+"\n"
+"Output options:\n"
+" -a, --annotate LIST optional tags to output; '?' to list []\n"
+" -g, --gvcf INT[,...] group non-variant sites into gVCF blocks according\n"
+" to minimum per-sample DP\n"
+" --no-version do not append version and command line to the header\n"
+" -o, --output FILE write output to FILE [standard output]\n"
+" -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n"
+" 'z' compressed VCF; 'v' uncompressed VCF [v]\n"
+" --threads INT number of extra output compression threads [0]\n"
+"\n"
+"SNP/INDEL genotype likelihoods options:\n"
+" -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", mplp->extQ);
+ fprintf(fp,
+" -F, --gap-frac FLOAT minimum fraction of gapped reads [%g]\n", mplp->min_frac);
+ fprintf(fp,
+" -h, --tandem-qual INT coefficient for homopolymer errors [%d]\n", mplp->tandemQ);
+ fprintf(fp,
+" -I, --skip-indels do not perform indel calling\n"
+" -L, --max-idepth INT maximum per-file depth for INDEL calling [%d]\n", mplp->max_indel_depth);
+ fprintf(fp,
+" -m, --min-ireads INT minimum number gapped reads for indel candidates [%d]\n", mplp->min_support);
+ fprintf(fp,
+" -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", mplp->openQ);
+ fprintf(fp,
+" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
+" -P, --platforms STR comma separated list of platforms for indels [all]\n"
+"\n"
+"Notes: Assuming diploid individuals.\n"
+"\n");
+
+ free(tmp_require);
+ free(tmp_filter);
+}
+
+int bam_mpileup(int argc, char *argv[])
+{
+ int c;
+ const char *file_list = NULL;
+ char **fn = NULL;
+ int nfiles = 0, use_orphan = 0, noref = 0;
+ mplp_conf_t mplp;
+ memset(&mplp, 0, sizeof(mplp_conf_t));
+ mplp.min_baseQ = 13;
+ mplp.capQ_thres = 0;
+ mplp.max_depth = 250; mplp.max_indel_depth = 250;
+ mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
+ mplp.min_frac = 0.002; mplp.min_support = 1;
+ mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS;
+ mplp.argc = argc; mplp.argv = argv;
+ mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
+ mplp.output_fname = NULL;
+ mplp.output_type = FT_VCF;
+ mplp.record_cmd_line = 1;
+ mplp.n_threads = 0;
+ mplp.bsmpl = bam_smpl_init();
+
+ static const struct option lopts[] =
+ {
+ {"rf", required_argument, NULL, 1}, // require flag
+ {"ff", required_argument, NULL, 2}, // filter flag
+ {"incl-flags", required_argument, NULL, 1},
+ {"excl-flags", required_argument, NULL, 2},
+ {"output", required_argument, NULL, 3},
+ {"open-prob", required_argument, NULL, 4},
+ {"ignore-RG", no_argument, NULL, 5},
+ {"ignore-rg", no_argument, NULL, 5},
+ {"gvcf", required_argument, NULL, 'g'},
+ {"non-reference", no_argument, NULL, 7},
+ {"no-version", no_argument, NULL, 8},
+ {"threads",required_argument,NULL,9},
+ {"illumina1.3+", no_argument, NULL, '6'},
+ {"count-orphans", no_argument, NULL, 'A'},
+ {"bam-list", required_argument, NULL, 'b'},
+ {"no-BAQ", no_argument, NULL, 'B'},
+ {"no-baq", no_argument, NULL, 'B'},
+ {"adjust-MQ", required_argument, NULL, 'C'},
+ {"adjust-mq", required_argument, NULL, 'C'},
+ {"max-depth", required_argument, NULL, 'd'},
+ {"redo-BAQ", no_argument, NULL, 'E'},
+ {"redo-baq", no_argument, NULL, 'E'},
+ {"fasta-ref", required_argument, NULL, 'f'},
+ {"read-groups", required_argument, NULL, 'G'},
+ {"region", required_argument, NULL, 'r'},
+ {"regions", required_argument, NULL, 'r'},
+ {"regions-file", required_argument, NULL, 'R'},
+ {"targets", required_argument, NULL, 't'},
+ {"targets-file", required_argument, NULL, 'T'},
+ {"min-MQ", required_argument, NULL, 'q'},
+ {"min-mq", required_argument, NULL, 'q'},
+ {"min-BQ", required_argument, NULL, 'Q'},
+ {"min-bq", required_argument, NULL, 'Q'},
+ {"ignore-overlaps", no_argument, NULL, 'x'},
+ {"output-type", required_argument, NULL, 'O'},
+ {"samples", required_argument, NULL, 's'},
+ {"samples-file", required_argument, NULL, 'S'},
+ {"annotate", required_argument, NULL, 'a'},
+ {"ext-prob", required_argument, NULL, 'e'},
+ {"gap-frac", required_argument, NULL, 'F'},
+ {"tandem-qual", required_argument, NULL, 'h'},
+ {"skip-indels", no_argument, NULL, 'I'},
+ {"max-idepth", required_argument, NULL, 'L'},
+ {"min-ireads ", required_argument, NULL, 'm'},
+ {"per-sample-mF", no_argument, NULL, 'p'},
+ {"per-sample-mf", no_argument, NULL, 'p'},
+ {"platforms", required_argument, NULL, 'P'},
+ {NULL, 0, NULL, 0}
+ };
+ while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:Bd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:",lopts,NULL)) >= 0) {
+ switch (c) {
+ case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
+ case 1 :
+ mplp.rflag_require = bam_str2flag(optarg);
+ if ( mplp.rflag_require<0 ) { fprintf(pysam_stderr,"Could not parse --rf %s\n", optarg); return 1; }
+ break;
+ case 2 :
+ mplp.rflag_filter = bam_str2flag(optarg);
+ if ( mplp.rflag_filter<0 ) { fprintf(pysam_stderr,"Could not parse --ff %s\n", optarg); return 1; }
+ break;
+ case 3 : mplp.output_fname = optarg; break;
+ case 4 : mplp.openQ = atoi(optarg); break;
+ case 5 : bam_smpl_ignore_readgroups(mplp.bsmpl); break;
+ case 'g':
+ mplp.gvcf = gvcf_init(optarg);
+ if ( !mplp.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
+ break;
+ case 'f':
+ mplp.fai = fai_load(optarg);
+ if (mplp.fai == NULL) return 1;
+ mplp.fai_fname = optarg;
+ break;
+ case 7 : noref = 1; break;
+ case 8 : mplp.record_cmd_line = 0; break;
+ case 9 : mplp.n_threads = strtol(optarg, 0, 0); break;
+ case 'd': mplp.max_depth = atoi(optarg); break;
+ case 'r': mplp.reg_fname = strdup(optarg); break;
+ case 'R': mplp.reg_fname = strdup(optarg); mplp.reg_is_file = 1; break;
+ case 't':
+ // In the original version the whole BAM was streamed which is inefficient
+ // with few BED intervals and big BAMs. Todo: devise a heuristic to determine
+ // best strategy, that is streaming or jumping.
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(NULL,regidx_parse_reg,NULL,0,NULL);
+ mplp.bed_itr = regitr_init(mplp.bed);
+ if ( regidx_insert_list(mplp.bed,optarg,',') !=0 )
+ {
+ fprintf(pysam_stderr,"Could not parse the targets: %s\n", optarg);
+ exit(EXIT_FAILURE);
+ }
+ break;
+ case 'T':
+ if ( optarg[0]=='^' ) optarg++;
+ else mplp.bed_logic = 1;
+ mplp.bed = regidx_init(optarg,NULL,NULL,0,NULL);
+ if (!mplp.bed) { fprintf(pysam_stderr, "bcftools mpileup: Could not read file \"%s\"", optarg); return 1; }
+ break;
+ case 'P': mplp.pl_list = strdup(optarg); break;
+ case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
+ case 'B': mplp.flag &= ~MPLP_REALN; break;
+ case 'I': mplp.flag |= MPLP_NO_INDEL; break;
+ case 'E': mplp.flag |= MPLP_REDO_BAQ; break;
+ case '6': mplp.flag |= MPLP_ILLUMINA13; break;
+ case 's': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,0)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'S': if ( bam_smpl_add_samples(mplp.bsmpl,optarg,1)<0 ) error("Could not read samples: %s\n",optarg); break;
+ case 'O':
+ switch (optarg[0]) {
+ case 'b': mplp.output_type = FT_BCF_GZ; break;
+ case 'u': mplp.output_type = FT_BCF; break;
+ case 'z': mplp.output_type = FT_VCF_GZ; break;
+ case 'v': mplp.output_type = FT_VCF; break;
+ default: error("[error] The option \"-O\" changed meaning when mpileup moved to bcftools. Did you mean: \"bcftools mpileup --output-type\" or \"samtools mpileup --output-BP\"?\n", optarg);
+ }
+ break;
+ case 'C': mplp.capQ_thres = atoi(optarg); break;
+ case 'q': mplp.min_mq = atoi(optarg); break;
+ case 'Q': mplp.min_baseQ = atoi(optarg); break;
+ case 'b': file_list = optarg; break;
+ case 'o': {
+ char *end;
+ long value = strtol(optarg, &end, 10);
+ // Distinguish between -o INT and -o FILE (a bit of a hack!)
+ if (*end == '\0') mplp.openQ = value;
+ else mplp.output_fname = optarg;
+ }
+ break;
+ case 'e': mplp.extQ = atoi(optarg); break;
+ case 'h': mplp.tandemQ = atoi(optarg); break;
+ case 'A': use_orphan = 1; break;
+ case 'F': mplp.min_frac = atof(optarg); break;
+ case 'm': mplp.min_support = atoi(optarg); break;
+ case 'L': mplp.max_indel_depth = atoi(optarg); break;
+ case 'G': bam_smpl_add_readgroups(mplp.bsmpl, optarg, 1); break;
+ case 'a':
+ if (optarg[0]=='?') {
+ list_annotations(pysam_stderr);
+ return 1;
+ }
+ mplp.fmt_flag |= parse_format_flag(optarg);
+ break;
+ default:
+ fprintf(pysam_stderr,"Invalid option: '%c'\n", c);
+ return 1;
+ }
+ }
+
+ if ( mplp.gvcf && !(mplp.fmt_flag&B2B_FMT_DP) )
+ {
+ fprintf(pysam_stderr,"[warning] The -t DP option is required with --gvcf, switching on.\n");
+ mplp.fmt_flag |= B2B_FMT_DP;
+ }
+ if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) )
+ {
+ if ( mplp.flag&MPLP_VCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF;
+ else mplp.output_type = FT_VCF_GZ;
+ }
+ else if ( mplp.flag&MPLP_BCF )
+ {
+ if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF;
+ else mplp.output_type = FT_BCF_GZ;
+ }
+ }
+ if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ )
+ {
+ fprintf(pysam_stderr,"Error: The -B option cannot be combined with -E\n");
+ return 1;
+ }
+ if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN;
+ if (argc == 1)
+ {
+ print_usage(pysam_stderr, &mplp);
+ return 1;
+ }
+ if (!mplp.fai && !noref) {
+ fprintf(pysam_stderr,"Error: mpileup requires the --fasta-ref option by default; use --no-reference to run without a fasta reference\n");
+ return 1;
+ }
+ int ret,i;
+ if (file_list)
+ {
+ if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+ mplp.files = fn;
+ mplp.nfiles = nfiles;
+ }
+ else
+ {
+ mplp.nfiles = argc - optind;
+ mplp.files = (char**) malloc(mplp.nfiles*sizeof(char*));
+ for (i=0; i<mplp.nfiles; i++) mplp.files[i] = strdup(argv[optind+i]);
+ }
+ ret = mpileup(&mplp);
+
+ for (i=0; i<mplp.nfiles; i++) free(mplp.files[i]);
+ free(mplp.files);
+ free(mplp.reg_fname); free(mplp.pl_list);
+ if (mplp.fai) fai_destroy(mplp.fai);
+ if (mplp.bed)
+ {
+ regidx_destroy(mplp.bed);
+ regitr_destroy(mplp.bed_itr);
+ }
+ if (mplp.reg) regidx_destroy(mplp.reg);
+ bam_smpl_destroy(mplp.bsmpl);
+ return ret;
+}
--- /dev/null
+/* mw.h -- a table of precomputed Mann Whitney coefficients (for bam2bcf.c)
+
+ The MIT License
+
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: James Bonfield <jkb@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+// Code to build this table is below
+#ifdef BUILD_MW
+#include <stdio.h>
+
+double mann_whitney_1947(int n, int m, int U)
+{
+ if (U<0) return 0;
+ if (n==0||m==0) return U==0 ? 1 : 0;
+ return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U);
+}
+
+int main(void) {
+ int i, j, k;
+ printf("static double mw[6][6][50] = // [2-7][2-7][0-49]\n{\n");
+ for (i = 2; i < 8; i++) {
+ printf(" {\n");
+ for (j = 2; j < 8; j++) {
+ printf(" {\n");
+ for (k = 0; k < 50; k++) {
+ printf(" %.17f,\n", mann_whitney_1947(i,j,k));
+ }
+ printf(" },\n");
+ }
+ printf(" },\n");
+ }
+ printf("};\n");
+ return 0;
+}
+#endif
+
+static double mw[6][6][50] = // [2-7][2-7][0-49]
+{
+ {
+ {
+ 0.16666666666666666,
+ 0.16666666666666666,
+ 0.33333333333333331,
+ 0.16666666666666666,
+ 0.16666666666666666,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.09999999999999999,
+ 0.09999999999999999,
+ 0.19999999999999998,
+ 0.20000000000000001,
+ 0.20000000000000001,
+ 0.10000000000000001,
+ 0.10000000000000001,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.06666666666666665,
+ 0.06666666666666665,
+ 0.13333333333333330,
+ 0.13333333333333333,
+ 0.20000000000000001,
+ 0.13333333333333333,
+ 0.13333333333333333,
+ 0.06666666666666667,
+ 0.06666666666666667,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.04761904761904761,
+ 0.04761904761904761,
+ 0.09523809523809522,
+ 0.09523809523809523,
+ 0.14285714285714288,
+ 0.14285714285714285,
+ 0.14285714285714285,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.04761904761904762,
+ 0.04761904761904762,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.07142857142857141,
+ 0.07142857142857142,
+ 0.10714285714285715,
+ 0.10714285714285714,
+ 0.14285714285714285,
+ 0.10714285714285715,
+ 0.10714285714285715,
+ 0.07142857142857144,
+ 0.07142857142857142,
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.02777777777777777,
+ 0.02777777777777777,
+ 0.05555555555555555,
+ 0.05555555555555555,
+ 0.08333333333333334,
+ 0.08333333333333333,
+ 0.11111111111111110,
+ 0.11111111111111113,
+ 0.11111111111111113,
+ 0.08333333333333334,
+ 0.08333333333333334,
+ 0.05555555555555556,
+ 0.05555555555555555,
+ 0.02777777777777778,
+ 0.02777777777777778,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.10000000000000001,
+ 0.10000000000000001,
+ 0.20000000000000001,
+ 0.20000000000000001,
+ 0.19999999999999998,
+ 0.09999999999999999,
+ 0.09999999999999999,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.05000000000000000,
+ 0.05000000000000000,
+ 0.10000000000000001,
+ 0.14999999999999999,
+ 0.14999999999999999,
+ 0.14999999999999999,
+ 0.14999999999999999,
+ 0.10000000000000001,
+ 0.05000000000000000,
+ 0.05000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.05714285714285714,
+ 0.08571428571428570,
+ 0.11428571428571427,
+ 0.11428571428571427,
+ 0.14285714285714282,
+ 0.11428571428571428,
+ 0.11428571428571428,
+ 0.08571428571428572,
+ 0.05714285714285714,
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.03571428571428571,
+ 0.05357142857142856,
+ 0.07142857142857142,
+ 0.08928571428571427,
+ 0.10714285714285711,
+ 0.10714285714285712,
+ 0.10714285714285714,
+ 0.10714285714285715,
+ 0.08928571428571427,
+ 0.07142857142857142,
+ 0.05357142857142857,
+ 0.03571428571428571,
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.02380952380952381,
+ 0.03571428571428571,
+ 0.04761904761904762,
+ 0.05952380952380951,
+ 0.08333333333333330,
+ 0.08333333333333331,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.05952380952380952,
+ 0.04761904761904762,
+ 0.03571428571428571,
+ 0.02380952380952381,
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.01666666666666666,
+ 0.02499999999999999,
+ 0.03333333333333333,
+ 0.04166666666666666,
+ 0.05833333333333331,
+ 0.06666666666666665,
+ 0.07499999999999998,
+ 0.08333333333333331,
+ 0.08333333333333331,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.07500000000000000,
+ 0.06666666666666667,
+ 0.05833333333333333,
+ 0.04166666666666666,
+ 0.03333333333333333,
+ 0.02500000000000000,
+ 0.01666666666666667,
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.06666666666666667,
+ 0.06666666666666667,
+ 0.13333333333333333,
+ 0.13333333333333333,
+ 0.20000000000000001,
+ 0.13333333333333333,
+ 0.13333333333333330,
+ 0.06666666666666665,
+ 0.06666666666666665,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.05714285714285714,
+ 0.08571428571428572,
+ 0.11428571428571428,
+ 0.11428571428571428,
+ 0.14285714285714282,
+ 0.11428571428571427,
+ 0.11428571428571427,
+ 0.08571428571428570,
+ 0.05714285714285714,
+ 0.02857142857142857,
+ 0.02857142857142857,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01428571428571429,
+ 0.01428571428571429,
+ 0.02857142857142857,
+ 0.04285714285714286,
+ 0.07142857142857142,
+ 0.07142857142857142,
+ 0.09999999999999998,
+ 0.09999999999999998,
+ 0.11428571428571427,
+ 0.09999999999999998,
+ 0.09999999999999998,
+ 0.07142857142857142,
+ 0.07142857142857142,
+ 0.04285714285714286,
+ 0.02857142857142857,
+ 0.01428571428571429,
+ 0.01428571428571429,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.01587301587301587,
+ 0.02380952380952381,
+ 0.03968253968253968,
+ 0.04761904761904762,
+ 0.06349206349206349,
+ 0.07142857142857142,
+ 0.08730158730158730,
+ 0.08730158730158730,
+ 0.09523809523809522,
+ 0.08730158730158728,
+ 0.08730158730158730,
+ 0.07142857142857142,
+ 0.06349206349206349,
+ 0.04761904761904761,
+ 0.03968253968253968,
+ 0.02380952380952381,
+ 0.01587301587301587,
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00952380952380952,
+ 0.01428571428571429,
+ 0.02380952380952381,
+ 0.02857142857142857,
+ 0.04285714285714286,
+ 0.04761904761904762,
+ 0.06190476190476190,
+ 0.06666666666666665,
+ 0.07619047619047617,
+ 0.07619047619047617,
+ 0.08571428571428569,
+ 0.07619047619047617,
+ 0.07619047619047620,
+ 0.06666666666666667,
+ 0.06190476190476191,
+ 0.04761904761904762,
+ 0.04285714285714286,
+ 0.02857142857142857,
+ 0.02380952380952381,
+ 0.01428571428571429,
+ 0.00952380952380952,
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00606060606060606,
+ 0.00909090909090909,
+ 0.01515151515151515,
+ 0.01818181818181818,
+ 0.02727272727272727,
+ 0.03333333333333333,
+ 0.04242424242424242,
+ 0.04848484848484847,
+ 0.05757575757575756,
+ 0.06060606060606059,
+ 0.06969696969696967,
+ 0.06969696969696967,
+ 0.07272727272727272,
+ 0.06969696969696969,
+ 0.06969696969696970,
+ 0.06060606060606059,
+ 0.05757575757575757,
+ 0.04848484848484848,
+ 0.04242424242424242,
+ 0.03333333333333333,
+ 0.02727272727272727,
+ 0.01818181818181818,
+ 0.01515151515151515,
+ 0.00909090909090909,
+ 0.00606060606060606,
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.04761904761904762,
+ 0.04761904761904762,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.14285714285714285,
+ 0.14285714285714285,
+ 0.14285714285714288,
+ 0.09523809523809523,
+ 0.09523809523809522,
+ 0.04761904761904761,
+ 0.04761904761904761,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.03571428571428571,
+ 0.05357142857142857,
+ 0.07142857142857142,
+ 0.08928571428571427,
+ 0.10714285714285715,
+ 0.10714285714285714,
+ 0.10714285714285712,
+ 0.10714285714285711,
+ 0.08928571428571427,
+ 0.07142857142857142,
+ 0.05357142857142856,
+ 0.03571428571428571,
+ 0.01785714285714286,
+ 0.01785714285714286,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.01587301587301587,
+ 0.02380952380952381,
+ 0.03968253968253968,
+ 0.04761904761904761,
+ 0.06349206349206349,
+ 0.07142857142857142,
+ 0.08730158730158730,
+ 0.08730158730158728,
+ 0.09523809523809522,
+ 0.08730158730158730,
+ 0.08730158730158730,
+ 0.07142857142857142,
+ 0.06349206349206349,
+ 0.04761904761904762,
+ 0.03968253968253968,
+ 0.02380952380952381,
+ 0.01587301587301587,
+ 0.00793650793650794,
+ 0.00793650793650794,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00396825396825397,
+ 0.00396825396825397,
+ 0.00793650793650794,
+ 0.01190476190476190,
+ 0.01984126984126984,
+ 0.02777777777777777,
+ 0.03571428571428571,
+ 0.04365079365079365,
+ 0.05555555555555555,
+ 0.06349206349206349,
+ 0.07142857142857142,
+ 0.07539682539682539,
+ 0.07936507936507936,
+ 0.07936507936507936,
+ 0.07539682539682539,
+ 0.07142857142857142,
+ 0.06349206349206349,
+ 0.05555555555555555,
+ 0.04365079365079365,
+ 0.03571428571428571,
+ 0.02777777777777777,
+ 0.01984126984126984,
+ 0.01190476190476190,
+ 0.00793650793650794,
+ 0.00396825396825397,
+ 0.00396825396825397,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00432900432900433,
+ 0.00649350649350649,
+ 0.01082251082251082,
+ 0.01515151515151515,
+ 0.02164502164502164,
+ 0.02597402597402597,
+ 0.03463203463203463,
+ 0.04112554112554112,
+ 0.04978354978354978,
+ 0.05411255411255411,
+ 0.06277056277056275,
+ 0.06493506493506493,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06493506493506492,
+ 0.06277056277056275,
+ 0.05411255411255410,
+ 0.04978354978354978,
+ 0.04112554112554112,
+ 0.03463203463203463,
+ 0.02597402597402597,
+ 0.02164502164502164,
+ 0.01515151515151515,
+ 0.01082251082251082,
+ 0.00649350649350649,
+ 0.00432900432900433,
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00252525252525253,
+ 0.00378787878787879,
+ 0.00631313131313131,
+ 0.00883838383838384,
+ 0.01262626262626262,
+ 0.01641414141414141,
+ 0.02146464646464646,
+ 0.02651515151515151,
+ 0.03282828282828283,
+ 0.03787878787878787,
+ 0.04419191919191919,
+ 0.04924242424242424,
+ 0.05429292929292929,
+ 0.05808080808080808,
+ 0.06060606060606059,
+ 0.06186868686868686,
+ 0.06186868686868686,
+ 0.06060606060606059,
+ 0.05808080808080807,
+ 0.05429292929292930,
+ 0.04924242424242424,
+ 0.04419191919191920,
+ 0.03787878787878787,
+ 0.03282828282828282,
+ 0.02651515151515152,
+ 0.02146464646464646,
+ 0.01641414141414142,
+ 0.01262626262626263,
+ 0.00883838383838384,
+ 0.00631313131313131,
+ 0.00378787878787879,
+ 0.00252525252525253,
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.07142857142857142,
+ 0.07142857142857144,
+ 0.10714285714285715,
+ 0.10714285714285715,
+ 0.14285714285714285,
+ 0.10714285714285714,
+ 0.10714285714285715,
+ 0.07142857142857142,
+ 0.07142857142857141,
+ 0.03571428571428571,
+ 0.03571428571428571,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.02380952380952381,
+ 0.03571428571428571,
+ 0.04761904761904762,
+ 0.05952380952380952,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.09523809523809523,
+ 0.08333333333333331,
+ 0.08333333333333330,
+ 0.05952380952380951,
+ 0.04761904761904762,
+ 0.03571428571428571,
+ 0.02380952380952381,
+ 0.01190476190476190,
+ 0.01190476190476190,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00952380952380952,
+ 0.01428571428571429,
+ 0.02380952380952381,
+ 0.02857142857142857,
+ 0.04285714285714286,
+ 0.04761904761904762,
+ 0.06190476190476191,
+ 0.06666666666666667,
+ 0.07619047619047620,
+ 0.07619047619047617,
+ 0.08571428571428569,
+ 0.07619047619047617,
+ 0.07619047619047617,
+ 0.06666666666666665,
+ 0.06190476190476190,
+ 0.04761904761904762,
+ 0.04285714285714286,
+ 0.02857142857142857,
+ 0.02380952380952381,
+ 0.01428571428571429,
+ 0.00952380952380952,
+ 0.00476190476190476,
+ 0.00476190476190476,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00432900432900433,
+ 0.00649350649350649,
+ 0.01082251082251082,
+ 0.01515151515151515,
+ 0.02164502164502164,
+ 0.02597402597402597,
+ 0.03463203463203463,
+ 0.04112554112554112,
+ 0.04978354978354978,
+ 0.05411255411255410,
+ 0.06277056277056275,
+ 0.06493506493506492,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06926406926406925,
+ 0.06493506493506493,
+ 0.06277056277056275,
+ 0.05411255411255411,
+ 0.04978354978354978,
+ 0.04112554112554112,
+ 0.03463203463203463,
+ 0.02597402597402597,
+ 0.02164502164502164,
+ 0.01515151515151515,
+ 0.01082251082251082,
+ 0.00649350649350649,
+ 0.00432900432900433,
+ 0.00216450216450216,
+ 0.00216450216450216,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00108225108225108,
+ 0.00108225108225108,
+ 0.00216450216450216,
+ 0.00324675324675325,
+ 0.00541125541125541,
+ 0.00757575757575758,
+ 0.01190476190476190,
+ 0.01406926406926407,
+ 0.01948051948051948,
+ 0.02380952380952381,
+ 0.03030303030303030,
+ 0.03463203463203463,
+ 0.04220779220779219,
+ 0.04545454545454544,
+ 0.05194805194805194,
+ 0.05519480519480519,
+ 0.05952380952380951,
+ 0.05952380952380952,
+ 0.06277056277056275,
+ 0.05952380952380952,
+ 0.05952380952380951,
+ 0.05519480519480519,
+ 0.05194805194805194,
+ 0.04545454545454544,
+ 0.04220779220779219,
+ 0.03463203463203463,
+ 0.03030303030303030,
+ 0.02380952380952381,
+ 0.01948051948051948,
+ 0.01406926406926407,
+ 0.01190476190476190,
+ 0.00757575757575758,
+ 0.00541125541125541,
+ 0.00324675324675325,
+ 0.00216450216450216,
+ 0.00108225108225108,
+ 0.00108225108225108,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00116550116550117,
+ 0.00174825174825175,
+ 0.00291375291375291,
+ 0.00407925407925408,
+ 0.00641025641025641,
+ 0.00815850815850816,
+ 0.01107226107226107,
+ 0.01398601398601398,
+ 0.01806526806526806,
+ 0.02156177156177156,
+ 0.02680652680652679,
+ 0.03030303030303030,
+ 0.03554778554778554,
+ 0.03962703962703962,
+ 0.04428904428904428,
+ 0.04720279720279720,
+ 0.05128205128205127,
+ 0.05244755244755244,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05244755244755243,
+ 0.05128205128205127,
+ 0.04720279720279720,
+ 0.04428904428904428,
+ 0.03962703962703962,
+ 0.03554778554778555,
+ 0.03030303030303030,
+ 0.02680652680652681,
+ 0.02156177156177156,
+ 0.01806526806526806,
+ 0.01398601398601399,
+ 0.01107226107226107,
+ 0.00815850815850816,
+ 0.00641025641025641,
+ 0.00407925407925408,
+ 0.00291375291375291,
+ 0.00174825174825175,
+ 0.00116550116550117,
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ },
+ {
+ {
+ 0.02777777777777778,
+ 0.02777777777777778,
+ 0.05555555555555555,
+ 0.05555555555555556,
+ 0.08333333333333334,
+ 0.08333333333333334,
+ 0.11111111111111113,
+ 0.11111111111111113,
+ 0.11111111111111110,
+ 0.08333333333333333,
+ 0.08333333333333334,
+ 0.05555555555555555,
+ 0.05555555555555555,
+ 0.02777777777777777,
+ 0.02777777777777777,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.01666666666666667,
+ 0.02500000000000000,
+ 0.03333333333333333,
+ 0.04166666666666666,
+ 0.05833333333333333,
+ 0.06666666666666667,
+ 0.07500000000000000,
+ 0.08333333333333333,
+ 0.08333333333333333,
+ 0.08333333333333331,
+ 0.08333333333333331,
+ 0.07499999999999998,
+ 0.06666666666666665,
+ 0.05833333333333331,
+ 0.04166666666666666,
+ 0.03333333333333333,
+ 0.02499999999999999,
+ 0.01666666666666666,
+ 0.00833333333333333,
+ 0.00833333333333333,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00606060606060606,
+ 0.00909090909090909,
+ 0.01515151515151515,
+ 0.01818181818181818,
+ 0.02727272727272727,
+ 0.03333333333333333,
+ 0.04242424242424242,
+ 0.04848484848484848,
+ 0.05757575757575757,
+ 0.06060606060606059,
+ 0.06969696969696970,
+ 0.06969696969696969,
+ 0.07272727272727272,
+ 0.06969696969696967,
+ 0.06969696969696967,
+ 0.06060606060606059,
+ 0.05757575757575756,
+ 0.04848484848484847,
+ 0.04242424242424242,
+ 0.03333333333333333,
+ 0.02727272727272727,
+ 0.01818181818181818,
+ 0.01515151515151515,
+ 0.00909090909090909,
+ 0.00606060606060606,
+ 0.00303030303030303,
+ 0.00303030303030303,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00252525252525253,
+ 0.00378787878787879,
+ 0.00631313131313131,
+ 0.00883838383838384,
+ 0.01262626262626263,
+ 0.01641414141414142,
+ 0.02146464646464646,
+ 0.02651515151515152,
+ 0.03282828282828282,
+ 0.03787878787878787,
+ 0.04419191919191920,
+ 0.04924242424242424,
+ 0.05429292929292930,
+ 0.05808080808080807,
+ 0.06060606060606059,
+ 0.06186868686868686,
+ 0.06186868686868686,
+ 0.06060606060606059,
+ 0.05808080808080808,
+ 0.05429292929292929,
+ 0.04924242424242424,
+ 0.04419191919191919,
+ 0.03787878787878787,
+ 0.03282828282828283,
+ 0.02651515151515151,
+ 0.02146464646464646,
+ 0.01641414141414141,
+ 0.01262626262626262,
+ 0.00883838383838384,
+ 0.00631313131313131,
+ 0.00378787878787879,
+ 0.00252525252525253,
+ 0.00126262626262626,
+ 0.00126262626262626,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00116550116550117,
+ 0.00174825174825175,
+ 0.00291375291375291,
+ 0.00407925407925408,
+ 0.00641025641025641,
+ 0.00815850815850816,
+ 0.01107226107226107,
+ 0.01398601398601399,
+ 0.01806526806526806,
+ 0.02156177156177156,
+ 0.02680652680652681,
+ 0.03030303030303030,
+ 0.03554778554778555,
+ 0.03962703962703962,
+ 0.04428904428904428,
+ 0.04720279720279720,
+ 0.05128205128205127,
+ 0.05244755244755243,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05477855477855477,
+ 0.05244755244755244,
+ 0.05128205128205127,
+ 0.04720279720279720,
+ 0.04428904428904428,
+ 0.03962703962703962,
+ 0.03554778554778554,
+ 0.03030303030303030,
+ 0.02680652680652679,
+ 0.02156177156177156,
+ 0.01806526806526806,
+ 0.01398601398601398,
+ 0.01107226107226107,
+ 0.00815850815850816,
+ 0.00641025641025641,
+ 0.00407925407925408,
+ 0.00291375291375291,
+ 0.00174825174825175,
+ 0.00116550116550117,
+ 0.00058275058275058,
+ 0.00058275058275058,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ 0.00000000000000000,
+ },
+ {
+ 0.00029137529137529,
+ 0.00029137529137529,
+ 0.00058275058275058,
+ 0.00087412587412587,
+ 0.00145687645687646,
+ 0.00203962703962704,
+ 0.00320512820512821,
+ 0.00437062937062937,
+ 0.00582750582750583,
+ 0.00757575757575758,
+ 0.00990675990675991,
+ 0.01223776223776224,
+ 0.01544289044289044,
+ 0.01835664335664336,
+ 0.02185314685314686,
+ 0.02534965034965035,
+ 0.02913752913752913,
+ 0.03263403263403263,
+ 0.03642191142191141,
+ 0.03962703962703962,
+ 0.04254079254079253,
+ 0.04516317016317015,
+ 0.04720279720279719,
+ 0.04836829836829836,
+ 0.04924242424242423,
+ 0.04924242424242423,
+ 0.04836829836829836,
+ 0.04720279720279719,
+ 0.04516317016317015,
+ 0.04254079254079253,
+ 0.03962703962703962,
+ 0.03642191142191141,
+ 0.03263403263403263,
+ 0.02913752913752913,
+ 0.02534965034965035,
+ 0.02185314685314686,
+ 0.01835664335664336,
+ 0.01544289044289044,
+ 0.01223776223776224,
+ 0.00990675990675991,
+ 0.00757575757575758,
+ 0.00582750582750583,
+ 0.00437062937062937,
+ 0.00320512820512821,
+ 0.00203962703962704,
+ 0.00145687645687646,
+ 0.00087412587412587,
+ 0.00058275058275058,
+ 0.00029137529137529,
+ 0.00029137529137529,
+ },
+ },
+};
-/*
+/*
Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
THE SOFTWARE.
*/
-#include <htslib/regidx.h>
#include <htslib/khash_str2int.h>
#include <htslib/kseq.h>
#include <htslib/hts.h>
int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
int *sex2dflt;
regidx_t *idx;
+ regitr_t *itr;
void *sex2id;
char **id2sex;
kstring_t tmp_str;
return ploidy->idx;
}
-int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
{
int i, ret;
ploidy_t *ploidy = (ploidy_t*) usr;
else
{
// Fill CHR,FROM,TO
- ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
if ( ret!=0 ) return ret;
}
ploidy_destroy(pld);
return NULL;
}
+ pld->itr = regitr_init(pld->idx);
_set_defaults(pld,dflt);
return pld;
}
pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+ pld->itr = regitr_init(pld->idx);
kstring_t tmp = {0,0,0};
const char *ss = str;
while ( *se && isspace(*se) ) se++;
ss = se;
}
- regidx_insert(pld->idx,NULL);
free(tmp.s);
_set_defaults(pld,dflt);
void ploidy_destroy(ploidy_t *ploidy)
{
if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
+ if ( ploidy->itr ) regitr_destroy(ploidy->itr);
if ( ploidy->idx ) regidx_destroy(ploidy->idx);
free(ploidy->id2sex);
free(ploidy->tmp_str.s);
int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
{
- regitr_t itr;
- int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr);
+ int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
if ( !sex2ploidy && !min && !max ) return ret;
int _min = INT_MAX, _max = -1;
if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
- while ( REGITR_OVERLAP(itr,pos,pos) )
+ while ( regitr_overlap(ploidy->itr) )
{
- int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex;
- int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy;
+ int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
+ int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
if ( pld!=ploidy->dflt )
{
if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
if ( _min > pld ) _min = pld;
if ( _max < pld ) _max = pld;
}
- itr.i++;
}
if ( _max==-1 ) _max = _min = ploidy->dflt;
if ( max ) *max = _max;
#include "pysam.h"
-/*
+/*
Copyright (C) 2014-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
THE SOFTWARE.
*/
-#include <htslib/regidx.h>
#include <htslib/khash_str2int.h>
#include <htslib/kseq.h>
#include <htslib/hts.h>
int dflt, min, max; // ploidy: default, min and max (only explicitly listed)
int *sex2dflt;
regidx_t *idx;
+ regitr_t *itr;
void *sex2id;
char **id2sex;
kstring_t tmp_str;
return ploidy->idx;
}
-int ploidy_parse(const char *line, char **chr_beg, char **chr_end, reg_t *reg, void *payload, void *usr)
+int ploidy_parse(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
{
int i, ret;
ploidy_t *ploidy = (ploidy_t*) usr;
else
{
// Fill CHR,FROM,TO
- ret = regidx_parse_tab(line,chr_beg,chr_end,reg,NULL,NULL);
+ ret = regidx_parse_tab(line,chr_beg,chr_end,beg,end,NULL,NULL);
if ( ret!=0 ) return ret;
}
ploidy_destroy(pld);
return NULL;
}
+ pld->itr = regitr_init(pld->idx);
_set_defaults(pld,dflt);
return pld;
}
pld->min = pld->max = -1;
pld->sex2id = khash_str2int_init();
pld->idx = regidx_init(NULL,ploidy_parse,NULL,sizeof(sex_ploidy_t),pld);
+ pld->itr = regitr_init(pld->idx);
kstring_t tmp = {0,0,0};
const char *ss = str;
while ( *se && isspace(*se) ) se++;
ss = se;
}
- regidx_insert(pld->idx,NULL);
free(tmp.s);
_set_defaults(pld,dflt);
void ploidy_destroy(ploidy_t *ploidy)
{
if ( ploidy->sex2id ) khash_str2int_destroy_free(ploidy->sex2id);
+ if ( ploidy->itr ) regitr_destroy(ploidy->itr);
if ( ploidy->idx ) regidx_destroy(ploidy->idx);
free(ploidy->id2sex);
free(ploidy->tmp_str.s);
int ploidy_query(ploidy_t *ploidy, char *seq, int pos, int *sex2ploidy, int *min, int *max)
{
- regitr_t itr;
- int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, &itr);
+ int i, ret = regidx_overlap(ploidy->idx, seq,pos,pos, ploidy->itr);
if ( !sex2ploidy && !min && !max ) return ret;
int _min = INT_MAX, _max = -1;
if ( sex2ploidy ) for (i=0; i<ploidy->nsex; i++) sex2ploidy[i] = ploidy->dflt;
- while ( REGITR_OVERLAP(itr,pos,pos) )
+ while ( regitr_overlap(ploidy->itr) )
{
- int sex = REGITR_PAYLOAD(itr,sex_ploidy_t).sex;
- int pld = REGITR_PAYLOAD(itr,sex_ploidy_t).ploidy;
+ int sex = regitr_payload(ploidy->itr,sex_ploidy_t).sex;
+ int pld = regitr_payload(ploidy->itr,sex_ploidy_t).ploidy;
if ( pld!=ploidy->dflt )
{
if ( sex2ploidy ) sex2ploidy[ sex ] = pld;
if ( _min > pld ) _min = pld;
if ( _max < pld ) _max = pld;
}
- itr.i++;
}
if ( _max==-1 ) _max = _min = ploidy->dflt;
if ( max ) *max = _max;
#ifndef __PLOIDY_H__
#define __PLOIDY_H__
-#include <htslib/regidx.h>
+#include "regidx.h"
typedef struct _ploidy_t ploidy_t;
static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
{
int i, j;
- long *p, tmp;
- p = (long*) alloca(b->n_allele * sizeof(long));
+ long p_a[16], *p=p_a, tmp;
+ if (b->n_allele > 16)
+ p = (long*) malloc(b->n_allele * sizeof(long));
memset(p, 0, sizeof(long) * b->n_allele);
// Set P(D|g) for each sample and sum phread likelihoods across all samples to create lk
tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
for (i = b->n_allele - 1; i >= 0; --i)
if ((p[i]&0xf) == 0) break;
+ if (p != p_a)
+ free(p);
return i;
}
-/* f0 is minor allele fraction */
-int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+/* f0 is freq of the ref allele */
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var)
{
double sum, g[3];
double max, f3[3], *pdg = ma->pdg + k * 3;
g[i] /= sum;
if (g[i] > max) max = g[i], max_i = i;
}
+ if ( !is_var ) { max_i = 2; max = g[2]; } // force 0/0 genotype if the site is non-variant
max = 1. - max;
if (max < 1e-308) max = 1e-308;
q = (int)(-4.343 * log(max) + .499);
static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
{
int i, j;
- long *p, tmp;
- p = (long*) alloca(b->n_allele * sizeof(long));
+ long p_a[16], *p=p_a, tmp;
+ if (b->n_allele > 16)
+ p = (long*) malloc(b->n_allele * sizeof(long));
memset(p, 0, sizeof(long) * b->n_allele);
// Set P(D|g) for each sample and sum phread likelihoods across all samples to create lk
tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
for (i = b->n_allele - 1; i >= 0; --i)
if ((p[i]&0xf) == 0) break;
+ if (p != p_a)
+ free(p);
return i;
}
-/* f0 is minor allele fraction */
-int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+/* f0 is freq of the ref allele */
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var)
{
double sum, g[3];
double max, f3[3], *pdg = ma->pdg + k * 3;
g[i] /= sum;
if (g[i] > max) max = g[i], max_i = i;
}
+ if ( !is_var ) { max_i = 2; max = g[2]; } // force 0/0 genotype if the site is non-variant
max = 1. - max;
if (max < 1e-308) max = 1e-308;
q = (int)(-4.343 * log(max) + .499);
void bcf_p1_destroy(bcf_p1aux_t *ma);
void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma);
int bcf_p1_cal(call_t *call, bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
- int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
+ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k, int is_var);
void bcf_p1_dump_afs(bcf_p1aux_t *ma);
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
--- /dev/null
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <strings.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include "regidx.h"
+
+#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based
+
+#define iBIN(x) ((x)>>13)
+
+typedef struct
+{
+ uint32_t beg, end;
+}
+reg_t;
+
+typedef struct
+{
+ uint32_t pos, ireg; // y-coordinate and a pointer to reglist.reg and reglist.dat
+}
+pos_t;
+
+typedef struct _reglist_t reglist_t;
+
+typedef struct
+{
+ uint32_t beg, end, ireg; // query coordinates and the active region
+ regidx_t *ridx;
+ reglist_t *list;
+ int active;
+}
+_itr_t;
+
+// List of regions for one chromosome.
+struct _reglist_t
+{
+ uint32_t *idx, nidx; // index to list.reg+1
+ uint32_t nreg, mreg; // n:used, m:allocated
+ reg_t *reg; // regions
+ void *dat; // payload data
+ char *seq; // sequence name
+ int unsorted;
+
+};
+
+// Container of all sequences
+struct _regidx_t
+{
+ int nseq, mseq; // n:used, m:alloced
+ reglist_t *seq; // regions for each sequence
+ void *seq2regs; // hash for fast lookup from chr name to regions
+ char **seq_names;
+ regidx_free_f free; // function to free any data allocated by regidx_parse_f
+ regidx_parse_f parse; // parse one input line
+ void *usr; // user data to pass to regidx_parse_f
+ int payload_size;
+ void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand)
+ kstring_t str;
+};
+
+int regidx_seq_nregs(regidx_t *idx, const char *seq)
+{
+ int iseq;
+ if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence
+ return idx->seq[iseq].nreg;
+}
+
+int regidx_nregs(regidx_t *idx)
+{
+ int i, nreg = 0;
+ for (i=0; i<idx->nseq; i++) nreg += idx->seq[i].nreg;
+ return nreg;
+}
+
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+ *n = idx->nseq;
+ return idx->seq_names;
+}
+
+int regidx_insert_list(regidx_t *idx, char *line, char delim)
+{
+ kstring_t tmp = {0,0,0};
+ char *ss = line;
+ while ( *ss )
+ {
+ char *se = ss;
+ while ( *se && *se!=delim ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ if ( regidx_insert(idx,tmp.s) < 0 )
+ {
+ free(tmp.s);
+ return -1;
+ }
+ if ( !*se ) break;
+ ss = se+1;
+ }
+ free(tmp.s);
+ return 0;
+}
+
+static inline int cmp_regs(reg_t *a, reg_t *b)
+{
+ if ( a->beg < b->beg ) return -1;
+ if ( a->beg > b->beg ) return 1;
+ if ( a->end < b->end ) return 1; // longer intervals come first
+ if ( a->end > b->end ) return -1;
+ return 0;
+}
+static int cmp_reg_ptrs(const void *a, const void *b)
+{
+ return cmp_regs((reg_t*)a,(reg_t*)b);
+}
+static int cmp_reg_ptrs2(const void *a, const void *b)
+{
+ return cmp_regs(*((reg_t**)a),*((reg_t**)b));
+}
+
+inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload)
+{
+ if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0;
+ if ( end > MAX_COOR_0 ) end = MAX_COOR_0;
+
+ int rid;
+ idx->str.l = 0;
+ kputsn(chr_beg, chr_end-chr_beg+1, &idx->str);
+ if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+ {
+ // new chromosome
+ idx->nseq++;
+ int m_prev = idx->mseq;
+ hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+ hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+ idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+ rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+ }
+
+ reglist_t *list = &idx->seq[rid];
+ list->seq = idx->seq_names[rid];
+ list->nreg++;
+ int mreg = list->mreg;
+ hts_expand(reg_t,list->nreg,list->mreg,list->reg);
+ list->reg[list->nreg-1].beg = beg;
+ list->reg[list->nreg-1].end = end;
+ if ( idx->payload_size )
+ {
+ if ( mreg != list->mreg ) list->dat = realloc(list->dat,idx->payload_size*list->mreg);
+ memcpy((char *)list->dat + idx->payload_size*(list->nreg-1), payload, idx->payload_size);
+ }
+ if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1;
+ return 0;
+}
+
+int regidx_insert(regidx_t *idx, char *line)
+{
+ if ( !line ) return 0;
+ char *chr_from, *chr_to;
+ uint32_t beg,end;
+ int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr);
+ if ( ret==-2 ) return -1; // error
+ if ( ret==-1 ) return 0; // skip the line
+ regidx_push(idx, chr_from,chr_to,beg,end,idx->payload);
+ return 0;
+}
+
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+ if ( !parser )
+ {
+ if ( !fname ) parser = regidx_parse_tab;
+ else
+ {
+ int len = strlen(fname);
+ if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
+ parser = regidx_parse_bed;
+ else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
+ parser = regidx_parse_bed;
+ else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
+ parser = regidx_parse_bed;
+ else
+ parser = regidx_parse_tab;
+ }
+ }
+
+ regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+ idx->free = free_f;
+ idx->parse = parser;
+ idx->usr = usr_dat;
+ idx->seq2regs = khash_str2int_init();
+ idx->payload_size = payload_size;
+ if ( payload_size ) idx->payload = malloc(payload_size);
+
+ if ( !fname ) return idx;
+
+ kstring_t str = {0,0,0};
+
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) goto error;
+
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( regidx_insert(idx, str.s) ) goto error;
+ }
+
+ free(str.s);
+ hts_close(fp);
+ return idx;
+
+error:
+ free(str.s);
+ if ( fp ) hts_close(fp);
+ regidx_destroy(idx);
+ return NULL;
+}
+
+void regidx_destroy(regidx_t *idx)
+{
+ int i, j;
+ for (i=0; i<idx->nseq; i++)
+ {
+ reglist_t *list = &idx->seq[i];
+ if ( idx->free )
+ {
+ for (j=0; j<list->nreg; j++)
+ idx->free((char *)list->dat + idx->payload_size*j);
+ }
+ free(list->dat);
+ free(list->reg);
+ free(list->idx);
+ }
+ free(idx->seq_names);
+ free(idx->seq);
+ free(idx->str.s);
+ free(idx->payload);
+ khash_str2int_destroy_free(idx->seq2regs);
+ free(idx);
+}
+
+int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+{
+ int i;
+ if ( list->unsorted )
+ {
+ if ( !regidx->payload_size )
+ qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
+ else
+ {
+ reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
+ for (i=0; i<list->nreg; i++) ptr[i] = list->reg + i;
+ qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
+
+ void *tmp_dat = malloc(regidx->payload_size*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ memcpy((char *)tmp_dat+i*regidx->payload_size,
+ (char *)list->dat+iori*regidx->payload_size,
+ regidx->payload_size);
+ }
+ free(list->dat);
+ list->dat = tmp_dat;
+
+ reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ tmp_reg[i] = list->reg[iori];
+ }
+ free(ptr);
+ free(list->reg);
+ list->reg = tmp_reg;
+ list->mreg = list->nreg;
+ }
+ list->unsorted = 0;
+ }
+
+ list->nidx = 0;
+ int j,k, midx = 0;
+ for (j=0; j<list->nreg; j++)
+ {
+ int ibeg = iBIN(list->reg[j].beg);
+ int iend = iBIN(list->reg[j].end);
+ if ( midx <= iend )
+ {
+ int old_midx = midx;
+ midx = iend + 1;
+ kroundup32(midx);
+ list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t));
+ memset(list->idx+old_midx, 0, sizeof(uint32_t)*(midx-old_midx));
+ }
+ if ( ibeg==iend )
+ {
+ if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1;
+ }
+ else
+ {
+ for (k=ibeg; k<=iend; k++)
+ if ( !list->idx[k] ) list->idx[k] = j + 1;
+ }
+ if ( list->nidx < iend+1 ) list->nidx = iend+1;
+ }
+
+ return 0;
+}
+
+int regidx_overlap(regidx_t *regidx, const char *chr, uint32_t beg, uint32_t end, regitr_t *regitr)
+{
+ if ( regitr ) regitr->seq = NULL;
+
+ int iseq, ireg;
+ if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+ reglist_t *list = ®idx->seq[iseq];
+ if ( !list->nreg ) return 0;
+
+ if ( list->nreg==1 )
+ {
+ if ( beg > list->reg[0].end ) return 0;
+ if ( end < list->reg[0].beg ) return 0;
+ ireg = 0;
+ }
+ else
+ {
+ if ( !list->idx )
+ _reglist_build_index(regidx,list);
+
+ int ibeg = iBIN(beg);
+ if ( ibeg >= list->nidx ) return 0; // beg is too big
+
+ // find a matching region
+ uint32_t i = list->idx[ibeg];
+ if ( !i )
+ {
+ int iend = iBIN(end);
+ if ( iend > list->nidx ) iend = list->nidx;
+ for (i=ibeg; i<iend; i++)
+ if ( list->idx[i] ) break;
+ if ( i==iend ) return 0;
+ i = list->idx[i];
+ }
+
+ for (ireg=i-1; ireg<list->nreg; ireg++)
+ {
+ if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region
+ if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found
+ }
+
+ if ( ireg >= list->nreg ) return 0; // no match
+ }
+
+ if ( !regitr ) return 1; // match, but no more info to save
+
+ // may need to iterate over the matching regions later
+ _itr_t *itr = (_itr_t*)regitr->itr;
+ itr->ridx = regidx;
+ itr->list = list;
+ itr->beg = beg;
+ itr->end = end;
+ itr->ireg = ireg;
+ itr->active = 0;
+
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[ireg].beg;
+ regitr->end = list->reg[ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)list->dat + regidx->payload_size*ireg;
+
+ return 1;
+}
+
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ ss = se+1;
+ *end = strtod(ss, &se) - 1;
+ if ( ss==se ) { fprintf(stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ return 0;
+}
+
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(stderr,"Could not parse tab line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && *se!=':' ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(stderr,"Could not parse reg line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = se[0]=='-' ? MAX_COOR_0 : *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+regitr_t *regitr_init(regidx_t *regidx)
+{
+ regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t));
+ regitr->itr = (_itr_t*) calloc(1,sizeof(_itr_t));
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ itr->ridx = regidx;
+ itr->list = NULL;
+ return regitr;
+}
+
+void regitr_reset(regidx_t *regidx, regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ memset(itr,0,sizeof(_itr_t));
+ itr->ridx = regidx;
+}
+
+void regitr_destroy(regitr_t *regitr)
+{
+ free(regitr->itr);
+ free(regitr);
+}
+
+int regitr_overlap(regitr_t *regitr)
+{
+ if ( !regitr->seq ) return 0;
+
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ if ( !itr->active )
+ {
+ // is this the first call after regidx_overlap?
+ itr->active = 1;
+ itr->ireg++;
+ return 1;
+ }
+
+ reglist_t *list = itr->list;
+
+ int i;
+ for (i=itr->ireg; i<list->nreg; i++)
+ {
+ if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region
+ if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found
+ }
+
+ if ( i >= list->nreg ) return 0; // no match
+
+ itr->ireg = i + 1;
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[i].beg;
+ regitr->end = list->reg[i].end;
+ if ( itr->ridx->payload_size )
+ regitr->payload = (char *)list->dat + itr->ridx->payload_size*i;
+
+ return 1;
+}
+
+int regitr_loop(regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ regidx_t *regidx = itr->ridx;
+
+ if ( !itr->list ) // first time here
+ {
+ itr->list = regidx->seq;
+ itr->ireg = 0;
+ }
+
+ size_t iseq = itr->list - regidx->seq;
+ if ( iseq >= regidx->nseq ) return 0;
+
+ if ( itr->ireg >= itr->list->nreg )
+ {
+ iseq++;
+ if ( iseq >= regidx->nseq ) return 0; // no more sequences, done
+ itr->ireg = 0;
+ itr->list = ®idx->seq[iseq];
+ }
+
+ regitr->seq = itr->list->seq;
+ regitr->beg = itr->list->reg[itr->ireg].beg;
+ regitr->end = itr->list->reg[itr->ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg;
+ itr->ireg++;
+
+ return 1;
+}
+
+
+
--- /dev/null
+#include "pysam.h"
+
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <strings.h>
+#include <htslib/hts.h>
+#include <htslib/kstring.h>
+#include <htslib/kseq.h>
+#include <htslib/khash_str2int.h>
+#include "regidx.h"
+
+#define MAX_COOR_0 REGIDX_MAX // CSI and hts_itr_query limit, 0-based
+
+#define iBIN(x) ((x)>>13)
+
+typedef struct
+{
+ uint32_t beg, end;
+}
+reg_t;
+
+typedef struct
+{
+ uint32_t pos, ireg; // y-coordinate and a pointer to reglist.reg and reglist.dat
+}
+pos_t;
+
+typedef struct _reglist_t reglist_t;
+
+typedef struct
+{
+ uint32_t beg, end, ireg; // query coordinates and the active region
+ regidx_t *ridx;
+ reglist_t *list;
+ int active;
+}
+_itr_t;
+
+// List of regions for one chromosome.
+struct _reglist_t
+{
+ uint32_t *idx, nidx; // index to list.reg+1
+ uint32_t nreg, mreg; // n:used, m:allocated
+ reg_t *reg; // regions
+ void *dat; // payload data
+ char *seq; // sequence name
+ int unsorted;
+
+};
+
+// Container of all sequences
+struct _regidx_t
+{
+ int nseq, mseq; // n:used, m:alloced
+ reglist_t *seq; // regions for each sequence
+ void *seq2regs; // hash for fast lookup from chr name to regions
+ char **seq_names;
+ regidx_free_f free; // function to free any data allocated by regidx_parse_f
+ regidx_parse_f parse; // parse one input line
+ void *usr; // user data to pass to regidx_parse_f
+ int payload_size;
+ void *payload; // temporary payload data set by regidx_parse_f (sequence is not known beforehand)
+ kstring_t str;
+};
+
+int regidx_seq_nregs(regidx_t *idx, const char *seq)
+{
+ int iseq;
+ if ( khash_str2int_get(idx->seq2regs, seq, &iseq)!=0 ) return 0; // no such sequence
+ return idx->seq[iseq].nreg;
+}
+
+int regidx_nregs(regidx_t *idx)
+{
+ int i, nreg = 0;
+ for (i=0; i<idx->nseq; i++) nreg += idx->seq[i].nreg;
+ return nreg;
+}
+
+char **regidx_seq_names(regidx_t *idx, int *n)
+{
+ *n = idx->nseq;
+ return idx->seq_names;
+}
+
+int regidx_insert_list(regidx_t *idx, char *line, char delim)
+{
+ kstring_t tmp = {0,0,0};
+ char *ss = line;
+ while ( *ss )
+ {
+ char *se = ss;
+ while ( *se && *se!=delim ) se++;
+ tmp.l = 0;
+ kputsn(ss, se-ss, &tmp);
+ if ( regidx_insert(idx,tmp.s) < 0 )
+ {
+ free(tmp.s);
+ return -1;
+ }
+ if ( !*se ) break;
+ ss = se+1;
+ }
+ free(tmp.s);
+ return 0;
+}
+
+static inline int cmp_regs(reg_t *a, reg_t *b)
+{
+ if ( a->beg < b->beg ) return -1;
+ if ( a->beg > b->beg ) return 1;
+ if ( a->end < b->end ) return 1; // longer intervals come first
+ if ( a->end > b->end ) return -1;
+ return 0;
+}
+static int cmp_reg_ptrs(const void *a, const void *b)
+{
+ return cmp_regs((reg_t*)a,(reg_t*)b);
+}
+static int cmp_reg_ptrs2(const void *a, const void *b)
+{
+ return cmp_regs(*((reg_t**)a),*((reg_t**)b));
+}
+
+inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload)
+{
+ if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0;
+ if ( end > MAX_COOR_0 ) end = MAX_COOR_0;
+
+ int rid;
+ idx->str.l = 0;
+ kputsn(chr_beg, chr_end-chr_beg+1, &idx->str);
+ if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
+ {
+ // new chromosome
+ idx->nseq++;
+ int m_prev = idx->mseq;
+ hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
+ hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
+ idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
+ rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
+ }
+
+ reglist_t *list = &idx->seq[rid];
+ list->seq = idx->seq_names[rid];
+ list->nreg++;
+ int mreg = list->mreg;
+ hts_expand(reg_t,list->nreg,list->mreg,list->reg);
+ list->reg[list->nreg-1].beg = beg;
+ list->reg[list->nreg-1].end = end;
+ if ( idx->payload_size )
+ {
+ if ( mreg != list->mreg ) list->dat = realloc(list->dat,idx->payload_size*list->mreg);
+ memcpy((char *)list->dat + idx->payload_size*(list->nreg-1), payload, idx->payload_size);
+ }
+ if ( !list->unsorted && list->nreg>1 && cmp_regs(&list->reg[list->nreg-2],&list->reg[list->nreg-1])>0 ) list->unsorted = 1;
+ return 0;
+}
+
+int regidx_insert(regidx_t *idx, char *line)
+{
+ if ( !line ) return 0;
+ char *chr_from, *chr_to;
+ uint32_t beg,end;
+ int ret = idx->parse(line,&chr_from,&chr_to,&beg,&end,idx->payload,idx->usr);
+ if ( ret==-2 ) return -1; // error
+ if ( ret==-1 ) return 0; // skip the line
+ regidx_push(idx, chr_from,chr_to,beg,end,idx->payload);
+ return 0;
+}
+
+regidx_t *regidx_init(const char *fname, regidx_parse_f parser, regidx_free_f free_f, size_t payload_size, void *usr_dat)
+{
+ if ( !parser )
+ {
+ if ( !fname ) parser = regidx_parse_tab;
+ else
+ {
+ int len = strlen(fname);
+ if ( len>=7 && !strcasecmp(".bed.gz",fname+len-7) )
+ parser = regidx_parse_bed;
+ else if ( len>=8 && !strcasecmp(".bed.bgz",fname+len-8) )
+ parser = regidx_parse_bed;
+ else if ( len>=4 && !strcasecmp(".bed",fname+len-4) )
+ parser = regidx_parse_bed;
+ else
+ parser = regidx_parse_tab;
+ }
+ }
+
+ regidx_t *idx = (regidx_t*) calloc(1,sizeof(regidx_t));
+ idx->free = free_f;
+ idx->parse = parser;
+ idx->usr = usr_dat;
+ idx->seq2regs = khash_str2int_init();
+ idx->payload_size = payload_size;
+ if ( payload_size ) idx->payload = malloc(payload_size);
+
+ if ( !fname ) return idx;
+
+ kstring_t str = {0,0,0};
+
+ htsFile *fp = hts_open(fname,"r");
+ if ( !fp ) goto error;
+
+ while ( hts_getline(fp, KS_SEP_LINE, &str) > 0 )
+ {
+ if ( regidx_insert(idx, str.s) ) goto error;
+ }
+
+ free(str.s);
+ hts_close(fp);
+ return idx;
+
+error:
+ free(str.s);
+ if ( fp ) hts_close(fp);
+ regidx_destroy(idx);
+ return NULL;
+}
+
+void regidx_destroy(regidx_t *idx)
+{
+ int i, j;
+ for (i=0; i<idx->nseq; i++)
+ {
+ reglist_t *list = &idx->seq[i];
+ if ( idx->free )
+ {
+ for (j=0; j<list->nreg; j++)
+ idx->free((char *)list->dat + idx->payload_size*j);
+ }
+ free(list->dat);
+ free(list->reg);
+ free(list->idx);
+ }
+ free(idx->seq_names);
+ free(idx->seq);
+ free(idx->str.s);
+ free(idx->payload);
+ khash_str2int_destroy_free(idx->seq2regs);
+ free(idx);
+}
+
+int _reglist_build_index(regidx_t *regidx, reglist_t *list)
+{
+ int i;
+ if ( list->unsorted )
+ {
+ if ( !regidx->payload_size )
+ qsort(list->reg,list->nreg,sizeof(reg_t),cmp_reg_ptrs);
+ else
+ {
+ reg_t **ptr = (reg_t**) malloc(sizeof(reg_t*)*list->nreg);
+ for (i=0; i<list->nreg; i++) ptr[i] = list->reg + i;
+ qsort(ptr,list->nreg,sizeof(*ptr),cmp_reg_ptrs2);
+
+ void *tmp_dat = malloc(regidx->payload_size*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ memcpy((char *)tmp_dat+i*regidx->payload_size,
+ (char *)list->dat+iori*regidx->payload_size,
+ regidx->payload_size);
+ }
+ free(list->dat);
+ list->dat = tmp_dat;
+
+ reg_t *tmp_reg = (reg_t*) malloc(sizeof(reg_t)*list->nreg);
+ for (i=0; i<list->nreg; i++)
+ {
+ size_t iori = ptr[i] - list->reg;
+ tmp_reg[i] = list->reg[iori];
+ }
+ free(ptr);
+ free(list->reg);
+ list->reg = tmp_reg;
+ list->mreg = list->nreg;
+ }
+ list->unsorted = 0;
+ }
+
+ list->nidx = 0;
+ int j,k, midx = 0;
+ for (j=0; j<list->nreg; j++)
+ {
+ int ibeg = iBIN(list->reg[j].beg);
+ int iend = iBIN(list->reg[j].end);
+ if ( midx <= iend )
+ {
+ int old_midx = midx;
+ midx = iend + 1;
+ kroundup32(midx);
+ list->idx = (uint32_t*) realloc(list->idx, midx*sizeof(uint32_t));
+ memset(list->idx+old_midx, 0, sizeof(uint32_t)*(midx-old_midx));
+ }
+ if ( ibeg==iend )
+ {
+ if ( !list->idx[ibeg] ) list->idx[ibeg] = j + 1;
+ }
+ else
+ {
+ for (k=ibeg; k<=iend; k++)
+ if ( !list->idx[k] ) list->idx[k] = j + 1;
+ }
+ if ( list->nidx < iend+1 ) list->nidx = iend+1;
+ }
+
+ return 0;
+}
+
+int regidx_overlap(regidx_t *regidx, const char *chr, uint32_t beg, uint32_t end, regitr_t *regitr)
+{
+ if ( regitr ) regitr->seq = NULL;
+
+ int iseq, ireg;
+ if ( khash_str2int_get(regidx->seq2regs, chr, &iseq)!=0 ) return 0; // no such sequence
+
+ reglist_t *list = ®idx->seq[iseq];
+ if ( !list->nreg ) return 0;
+
+ if ( list->nreg==1 )
+ {
+ if ( beg > list->reg[0].end ) return 0;
+ if ( end < list->reg[0].beg ) return 0;
+ ireg = 0;
+ }
+ else
+ {
+ if ( !list->idx )
+ _reglist_build_index(regidx,list);
+
+ int ibeg = iBIN(beg);
+ if ( ibeg >= list->nidx ) return 0; // beg is too big
+
+ // find a matching region
+ uint32_t i = list->idx[ibeg];
+ if ( !i )
+ {
+ int iend = iBIN(end);
+ if ( iend > list->nidx ) iend = list->nidx;
+ for (i=ibeg; i<iend; i++)
+ if ( list->idx[i] ) break;
+ if ( i==iend ) return 0;
+ i = list->idx[i];
+ }
+
+ for (ireg=i-1; ireg<list->nreg; ireg++)
+ {
+ if ( list->reg[ireg].beg > end ) return 0; // no match, past the query region
+ if ( list->reg[ireg].end >= beg && list->reg[ireg].beg <= end ) break; // found
+ }
+
+ if ( ireg >= list->nreg ) return 0; // no match
+ }
+
+ if ( !regitr ) return 1; // match, but no more info to save
+
+ // may need to iterate over the matching regions later
+ _itr_t *itr = (_itr_t*)regitr->itr;
+ itr->ridx = regidx;
+ itr->list = list;
+ itr->beg = beg;
+ itr->end = end;
+ itr->ireg = ireg;
+ itr->active = 0;
+
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[ireg].beg;
+ regitr->end = list->reg[ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)list->dat + regidx->payload_size*ireg;
+
+ return 1;
+}
+
+int regidx_parse_bed(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ ss = se+1;
+ *end = strtod(ss, &se) - 1;
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse bed line: %s\n", line); return -2; }
+
+ return 0;
+}
+
+int regidx_parse_tab(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ // just the chromosome name
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse tab line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse tab line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+int regidx_parse_reg(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr)
+{
+ char *ss = (char*) line;
+ while ( *ss && isspace(*ss) ) ss++;
+ if ( !*ss ) return -1; // skip blank lines
+ if ( *ss=='#' ) return -1; // skip comments
+
+ char *se = ss;
+ while ( *se && *se!=':' ) se++;
+
+ *chr_beg = ss;
+ *chr_end = se-1;
+
+ if ( !*se )
+ {
+ *beg = 0;
+ *end = MAX_COOR_0;
+ return 0;
+ }
+
+ ss = se+1;
+ *beg = strtod(ss, &se);
+ if ( ss==se ) { fprintf(pysam_stderr,"Could not parse reg line: %s\n", line); return -2; }
+ if ( *beg==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ (*beg)--;
+
+ if ( !se[0] || !se[1] )
+ *end = se[0]=='-' ? MAX_COOR_0 : *beg;
+ else
+ {
+ ss = se+1;
+ *end = strtod(ss, &se);
+ if ( ss==se ) *end = *beg;
+ else if ( *end==0 ) { fprintf(pysam_stderr,"Could not parse reg line, expected 1-based coordinate: %s\n", line); return -2; }
+ else (*end)--;
+ }
+ return 0;
+}
+
+regitr_t *regitr_init(regidx_t *regidx)
+{
+ regitr_t *regitr = (regitr_t*) calloc(1,sizeof(regitr_t));
+ regitr->itr = (_itr_t*) calloc(1,sizeof(_itr_t));
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ itr->ridx = regidx;
+ itr->list = NULL;
+ return regitr;
+}
+
+void regitr_reset(regidx_t *regidx, regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ memset(itr,0,sizeof(_itr_t));
+ itr->ridx = regidx;
+}
+
+void regitr_destroy(regitr_t *regitr)
+{
+ free(regitr->itr);
+ free(regitr);
+}
+
+int regitr_overlap(regitr_t *regitr)
+{
+ if ( !regitr->seq ) return 0;
+
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ if ( !itr->active )
+ {
+ // is this the first call after regidx_overlap?
+ itr->active = 1;
+ itr->ireg++;
+ return 1;
+ }
+
+ reglist_t *list = itr->list;
+
+ int i;
+ for (i=itr->ireg; i<list->nreg; i++)
+ {
+ if ( list->reg[i].beg > itr->end ) return 0; // no match, past the query region
+ if ( list->reg[i].end >= itr->beg && list->reg[i].beg <= itr->end ) break; // found
+ }
+
+ if ( i >= list->nreg ) return 0; // no match
+
+ itr->ireg = i + 1;
+ regitr->seq = list->seq;
+ regitr->beg = list->reg[i].beg;
+ regitr->end = list->reg[i].end;
+ if ( itr->ridx->payload_size )
+ regitr->payload = (char *)list->dat + itr->ridx->payload_size*i;
+
+ return 1;
+}
+
+int regitr_loop(regitr_t *regitr)
+{
+ _itr_t *itr = (_itr_t*) regitr->itr;
+ regidx_t *regidx = itr->ridx;
+
+ if ( !itr->list ) // first time here
+ {
+ itr->list = regidx->seq;
+ itr->ireg = 0;
+ }
+
+ size_t iseq = itr->list - regidx->seq;
+ if ( iseq >= regidx->nseq ) return 0;
+
+ if ( itr->ireg >= itr->list->nreg )
+ {
+ iseq++;
+ if ( iseq >= regidx->nseq ) return 0; // no more sequences, done
+ itr->ireg = 0;
+ itr->list = ®idx->seq[iseq];
+ }
+
+ regitr->seq = itr->list->seq;
+ regitr->beg = itr->list->reg[itr->ireg].beg;
+ regitr->end = itr->list->reg[itr->ireg].end;
+ if ( regidx->payload_size )
+ regitr->payload = (char *)itr->list->dat + regidx->payload_size*itr->ireg;
+ itr->ireg++;
+
+ return 1;
+}
+
+
+
--- /dev/null
+/*
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+/*
+ Region indexing with an optional payload.
+
+ Example of usage:
+
+ // Init the parser and print regions. In this example the payload is a
+ // pointer to a string. For the description of parse_custom and
+ // free_custom functions, see regidx_parse_f and regidx_free_f below,
+ // and for working example see test/test-regidx.c.
+ regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
+
+ // Query overlap with chr:from-to
+ regitr_t *itr = regitr_init(idx);
+ if ( regidx_overlap(idx, chr,from,to, itr) ) printf("There is an overlap!\n");
+
+ while ( regitr_overlap(itr) )
+ {
+ printf("[%d,%d] overlaps with [%d,%d], payload=%s\n", from,to,
+ itr->beg, itr->end, regitr_payload(itr,char*));
+ }
+
+ regidx_destroy(idx);
+ regitr_destroy(itr);
+
+
+ Another example, loop over all regions:
+
+ regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL);
+ regitr_t *itr = regitr_init(idx);
+
+ while ( regitr_loop(itr) )
+ printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg, itr->end);
+
+ regidx_destroy(idx);
+ regitr_destroy(itr);
+*/
+
+#ifndef __REGIDX_H__
+#define __REGIDX_H__
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REGIDX_MAX 2147483646 // maximum regidx coordinate (0-based)
+
+typedef struct _regidx_t regidx_t;
+typedef struct
+{
+ uint32_t beg,end;
+ void *payload;
+ char *seq;
+ void *itr;
+}
+regitr_t;
+
+#define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload))
+
+/*
+ * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
+ * or regidx_parse_tab below. The function is expected to set `chr_from` and
+ * `chr_to` to point to first and last character of chromosome name and set
+ * coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was
+ * called with non-zero payload_size, the `payload` points to a memory
+ * location of the payload_size and `usr` is the data passed to regidx_init().
+ * Any memory allocated by the function will be freed by regidx_free_f called
+ * by regidx_destroy().
+ *
+ * Return value: 0 on success, -1 to skip a record, -2 on fatal error.
+ */
+typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, uint32_t *beg, uint32_t *end, void *payload, void *usr);
+typedef void (*regidx_free_f)(void *payload);
+
+/*
+ * A note about the parsers:
+ * - leading spaces are ignored
+ * - lines starting with "#" are ignored
+ */
+int regidx_parse_bed(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-sepatated CHROM,FROM,TO (0-based,right-open)
+int regidx_parse_tab(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM or whitespace-separated CHROM,POS (1-based, inclusive)
+int regidx_parse_reg(const char*,char**,char**,uint32_t*,uint32_t*,void*,void*); // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive)
+
+/*
+ * regidx_init() - creates new index
+ * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert()
+ * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
+ * the format will be autodected, currently either regidx_parse_tab (the default) or
+ * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
+ * the exact autodetection algorithm will change.
+ * @param freef: NULL or see description of regidx_parse_f
+ * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
+ * @param usr: optional user data passed to regidx_parse_f
+ *
+ * Returns index on success or NULL on error.
+ */
+regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr);
+
+/*
+ * regidx_destroy() - free memory allocated by regidx_init
+ */
+void regidx_destroy(regidx_t *idx);
+
+/*
+ * regidx_overlap() - check overlap of the location chr:from-to with regions
+ * @param beg,end: 0-based start, end coordinate (inclusive)
+ * @param itr: pointer to iterator, can be NULL if regidx_loop not needed
+ *
+ * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
+ * regions can be iterated as shown in the example above.
+ */
+int regidx_overlap(regidx_t *idx, const char *chr, uint32_t beg, uint32_t end, regitr_t *itr);
+
+/*
+ * regidx_insert() - add a new region.
+ * regidx_insert_list() - add new regions from a list
+ * regidx_push() - low level insertion of a new region
+ *
+ * Returns 0 on success or -1 on error.
+ */
+int regidx_insert(regidx_t *idx, char *line);
+int regidx_insert_list(regidx_t *idx, char *line, char delim);
+int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload);
+
+/*
+ * regidx_seq_names() - return list of all sequence names
+ */
+char **regidx_seq_names(regidx_t *idx, int *n);
+
+/*
+ * regidx_seq_nregs() - number of regions
+ * regidx_nregs() - total number of regions
+ */
+int regidx_seq_nregs(regidx_t *idx, const char *seq);
+int regidx_nregs(regidx_t *idx);
+
+/*
+ * regitr_init() - initialize an iterator. The idx parameter is required only
+ * with regitr_loop. If only regitr_overlap is called, NULL
+ * can be given.
+ *
+ * regitr_reset() - initialize an iterator for a repeated regitr_loop cycle.
+ * Not required with regitr_overlap.
+ */
+regitr_t *regitr_init(regidx_t *idx);
+void regitr_destroy(regitr_t *itr);
+void regitr_reset(regidx_t *idx, regitr_t *itr);
+
+/*
+ * regitr_overlap() - next overlapping region
+ * Returns 0 when done or 1 when itr is set to next region
+ */
+int regitr_overlap(regitr_t *itr);
+
+/*
+ * regitr_loop() - loop over all regions
+ * Returns 0 when done or 1 when itr is set to next region
+ */
+int regitr_loop(regitr_t *itr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "bcftools.h"
+#include "smpl_ilist.h"
+
+void smpl_ilist_destroy(smpl_ilist_t *smpl)
+{
+ free(smpl->idx);
+ free(smpl);
+}
+
+smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags)
+{
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ if ( !sample_list )
+ {
+ smpl->n = bcf_hdr_nsamples(hdr);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++) smpl->idx[i] = i;
+ return smpl;
+ }
+
+ int nlist;
+ char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist);
+ if ( !list ) error("Could not parse %s\n", sample_list);
+
+ // preserve the VCF order
+ int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int));
+ for (i=0; i<nlist; i++)
+ {
+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, list[i]);
+ if ( idx>=0 )
+ {
+ tmp[idx] = 1;
+ smpl->n++;
+ }
+ else if ( flags&SMPL_STRICT )
+ error("No such sample: %s\n", list[i]);
+ }
+
+ if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n;
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+
+ int j = 0;
+ if ( sample_list[0]!='^' )
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( tmp[i] ) smpl->idx[j++] = i;
+ }
+ else
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !tmp[i] ) smpl->idx[j++] = i;
+ }
+
+ free(tmp);
+ for (i=0; i<nlist; i++) free(list[i]);
+ free(list);
+
+ return smpl;
+}
+
+smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags)
+{
+ if ( flags&SMPL_STRICT && bcf_hdr_nsamples(hdr_a)!=bcf_hdr_nsamples(hdr_b) )
+ error("Different number of samples: %d vs %d\n", bcf_hdr_nsamples(hdr_a),bcf_hdr_nsamples(hdr_b));
+
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ smpl->n = bcf_hdr_nsamples(hdr_a);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++)
+ {
+ const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i);
+ smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name);
+ if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
+ error("The sample %s is not present in the second file\n", name);
+ }
+ return smpl;
+}
+
--- /dev/null
+#include "pysam.h"
+
+/*
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "bcftools.h"
+#include "smpl_ilist.h"
+
+void smpl_ilist_destroy(smpl_ilist_t *smpl)
+{
+ free(smpl->idx);
+ free(smpl);
+}
+
+smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags)
+{
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ if ( !sample_list )
+ {
+ smpl->n = bcf_hdr_nsamples(hdr);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++) smpl->idx[i] = i;
+ return smpl;
+ }
+
+ int nlist;
+ char **list = hts_readlist(sample_list[0]=='^'?sample_list+1:sample_list, is_file, &nlist);
+ if ( !list ) error("Could not parse %s\n", sample_list);
+
+ // preserve the VCF order
+ int *tmp = (int*)calloc(bcf_hdr_nsamples(hdr),sizeof(int));
+ for (i=0; i<nlist; i++)
+ {
+ int idx = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, list[i]);
+ if ( idx>=0 )
+ {
+ tmp[idx] = 1;
+ smpl->n++;
+ }
+ else if ( flags&SMPL_STRICT )
+ error("No such sample: %s\n", list[i]);
+ }
+
+ if ( sample_list[0]=='^' ) smpl->n = bcf_hdr_nsamples(hdr) - smpl->n;
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+
+ int j = 0;
+ if ( sample_list[0]!='^' )
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( tmp[i] ) smpl->idx[j++] = i;
+ }
+ else
+ {
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !tmp[i] ) smpl->idx[j++] = i;
+ }
+
+ free(tmp);
+ for (i=0; i<nlist; i++) free(list[i]);
+ free(list);
+
+ return smpl;
+}
+
+smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags)
+{
+ if ( flags&SMPL_STRICT && bcf_hdr_nsamples(hdr_a)!=bcf_hdr_nsamples(hdr_b) )
+ error("Different number of samples: %d vs %d\n", bcf_hdr_nsamples(hdr_a),bcf_hdr_nsamples(hdr_b));
+
+ smpl_ilist_t *smpl = (smpl_ilist_t*) calloc(1,sizeof(smpl_ilist_t));
+
+ int i;
+ smpl->n = bcf_hdr_nsamples(hdr_a);
+ smpl->idx = (int*) malloc(sizeof(int)*smpl->n);
+ for (i=0; i<smpl->n; i++)
+ {
+ const char *name = bcf_hdr_int2id(hdr_a, BCF_DT_SAMPLE, i);
+ smpl->idx[i] = bcf_hdr_id2int(hdr_b, BCF_DT_SAMPLE, name);
+ if ( flags&SMPL_STRICT && smpl->idx[i]<0 )
+ error("The sample %s is not present in the second file\n", name);
+ }
+ return smpl;
+}
+
--- /dev/null
+/*
+ Copyright (C) 2016 Genome Research Ltd.
+
+ Author: Petr Danecek <pd3@sanger.ac.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+/*
+ Parse --samples and --samples-file
+*/
+
+#ifndef __SMPL_ILIST_H__
+#define __SMPL_ILIST_H__
+
+#include <htslib/vcf.h>
+
+#define SMPL_NONE 0 // flexible error recovery
+#define SMPL_STRICT 1 // samples must exist
+
+typedef struct
+{
+ int *idx; // index to bcf_hdr_t.samples
+ int n;
+}
+smpl_ilist_t;
+
+smpl_ilist_t *smpl_ilist_init(bcf_hdr_t *hdr, char *sample_list, int is_file, int flags);
+smpl_ilist_t *smpl_ilist_map(bcf_hdr_t *hdr_a, bcf_hdr_t *hdr_b, int flags);
+void smpl_ilist_destroy(smpl_ilist_t *smpl);
+
+#endif
/* tabix.c -- tabix subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
int main_tabix(int argc, char *argv[])
{
- int c, min_shift = -1, is_force = 0, is_all = 0;
- tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+ int c, min_shift = -1, is_force = 0, is_all = 0, detect = 1;
+ tbx_conf_t conf = tbx_conf_gff;
while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
if (c == '0') conf.preset |= TBX_UCSC;
else if (c == 'f') is_force = 1;
else if (c == 'c') conf.meta_char = *optarg;
else if (c == 'S') conf.line_skip = atoi(optarg);
else if (c == 'p') {
- if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
- else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
- else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
- else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+ if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
+ else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
+ else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
+ else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
else {
fprintf(stderr, "The type '%s' not recognised\n", optarg);
return 1;
+ detect = 0;
}
}
bgzf_close(fp);
free(s.s);
} else if (optind + 2 > argc) { // create index
- if ( !conf_ptr )
+ if ( detect )
{
// auto-detect file type by file name
int l = strlen(argv[optind]);
int strcasecmp(const char *s1, const char *s2);
- if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
+ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf = tbx_conf_vcf;
}
- if ( conf_ptr ) conf = *conf_ptr;
if (!is_force) {
char *fn;
FILE *fp;
- fn = (char*)alloca(strlen(argv[optind]) + 5);
+ fn = (char*)malloc(strlen(argv[optind]) + 5);
strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
if ((fp = fopen(fn, "rb")) != 0) {
fclose(fp);
+ free(fn);
fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
return 1;
}
+ free(fn);
}
if ( tbx_index_build(argv[optind], min_shift, &conf) )
{
/* tabix.c -- tabix subcommand.
Copyright (C) 2012 Broad Institute.
- Copyright (C) 2013 Genome Research Ltd.
+ Copyright (C) 2013, 2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
int main_tabix(int argc, char *argv[])
{
- int c, min_shift = -1, is_force = 0, is_all = 0;
- tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL;
+ int c, min_shift = -1, is_force = 0, is_all = 0, detect = 1;
+ tbx_conf_t conf = tbx_conf_gff;
while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0)
if (c == '0') conf.preset |= TBX_UCSC;
else if (c == 'f') is_force = 1;
else if (c == 'c') conf.meta_char = *optarg;
else if (c == 'S') conf.line_skip = atoi(optarg);
else if (c == 'p') {
- if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff;
- else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed;
- else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam;
- else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf;
+ if (strcmp(optarg, "gff") == 0) conf = tbx_conf_gff;
+ else if (strcmp(optarg, "bed") == 0) conf = tbx_conf_bed;
+ else if (strcmp(optarg, "sam") == 0) conf = tbx_conf_sam;
+ else if (strcmp(optarg, "vcf") == 0) conf = tbx_conf_vcf;
else {
fprintf(pysam_stderr, "The type '%s' not recognised\n", optarg);
return 1;
+ detect = 0;
}
}
bgzf_close(fp);
free(s.s);
} else if (optind + 2 > argc) { // create index
- if ( !conf_ptr )
+ if ( detect )
{
// auto-detect file type by file name
int l = strlen(argv[optind]);
int strcasecmp(const char *s1, const char *s2);
- if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam;
- else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf;
+ if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf = tbx_conf_gff;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf = tbx_conf_bed;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf = tbx_conf_sam;
+ else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf = tbx_conf_vcf;
}
- if ( conf_ptr ) conf = *conf_ptr;
if (!is_force) {
char *fn;
FILE *fp;
- fn = (char*)alloca(strlen(argv[optind]) + 5);
+ fn = (char*)malloc(strlen(argv[optind]) + 5);
strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi");
if ((fp = fopen(fn, "rb")) != 0) {
fclose(fp);
+ free(fn);
fprintf(pysam_stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__);
return 1;
}
+ free(fn);
}
if ( tbx_index_build(argv[optind], min_shift, &conf) )
{
*/
#include <ctype.h>
+#include <strings.h>
#include "tsv2vcf.h"
tsv_t *tsv_init(const char *str)
*/
#include <ctype.h>
+#include <strings.h>
#include "tsv2vcf.h"
tsv_t *tsv_init(const char *str)
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
#include "vcmp.h"
#include "filter.h"
#include "convert.h"
+#include "smpl_ilist.h"
struct _args_t;
#define REPLACE_MISSING 0 // replace only missing values
#define REPLACE_ALL 1 // replace both missing and existing values
-#define REPLACE_EXISTING 2 // replace only if tgt is not missing
+#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
typedef struct _annot_col_t
{
int icol, replace, number; // number: one of BCF_VL_* types
- char *hdr_key;
+ char *hdr_key_src, *hdr_key_dst;
int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
}
annot_col_t;
convert_t *set_ids;
int set_ids_replace;
+ int nsmpl_annot;
int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc
int mtmpi, mtmpf, mtmps;
int mtmpi2, mtmpf2, mtmps2;
}
line->d.shared_dirty |= BCF1_DIRTY_INF;
inf->vptr = NULL;
+ inf->vptr_off = inf->vptr_len = 0;
}
}
void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
}
}
+#include "htslib/khash.h"
+KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+typedef khash_t(vdict) vdict_t;
+
static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
{
int i = 0, nrm = 0;
{
if ( hdr->hrec[i]->type!=type ) { i++; continue; }
bcf_hrec_t *hrec = hdr->hrec[i];
- if ( type==BCF_HL_FMT )
+ if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
{
// everything except FORMAT/GT
int id = bcf_hrec_find_key(hrec, "ID");
- if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ if ( id>=0 )
+ {
+ if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
+ khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]);
+ kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
+ kh_val(d, k).info[type] |= 0xf;
+ }
}
nrm++;
hdr->nhrec--;
line->qual = strtod(str, &str);
if ( str == tab->cols[col->icol] )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return 0;
}
static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
char *str = tab->cols[col->icol];
if ( str[0]=='.' && str[1]==0 ) return 0;
- if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1);
- if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0);
+ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
+ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return -1;
}
static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL);
- bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag);
+ int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL);
+ bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag);
return 0;
}
static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
{
if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
int i;
args->tmpi2[i] = args->tmpi[ map[i] ];
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
return 0;
}
static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
if ( ntmpi < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
int i;
args->tmpf2[i] = args->tmpf[ map[i] ];
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
return 0;
}
static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
if ( ntmpf < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
lsrc++;
}
if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
// fill in any missing values in the target VCF (or all, if not present)
int i, empty = 0, nstr, mstr = args->tmpks.m;
- nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr);
+ nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr);
args->tmpks.m = mstr;
if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
{
int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
assert( ret==0 );
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
return 0;
}
static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps);
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
if ( ntmps < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 ) // field not present in dst file
{
- if ( col->replace==REPLACE_EXISTING ) return 0;
+ if ( col->replace==REPLACE_NON_MISSING ) return 0;
hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
int32_t *src = args->tmpi + nsrc*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue;
if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
for (j=0; j<nsrc; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
int32_t *dst = args->tmpi3 + nsrc*i;
int keep_ori = 0;
if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
if ( keep_ori )
{
}
static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
{
- int i, nmax = 0;
+ int i, nmax = 1;
for (i=icol_beg; i<icol_end; i++)
{
char *str = tab->cols[i], *end = str;
}
return nmax;
}
-static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- int32_t *ptr = args->tmpi + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- ptr[ival++] = bcf_int32_missing;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
- icol++;
- }
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals);
-}
-static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- float *ptr = args->tmpf + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- bcf_float_set_missing(ptr[ival]);
- ival++;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtod(str, &end);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
- icol++;
- }
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals);
-}
-static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
-
- int i, max_len = 0;
- for (i=col->icol; i<col->icol+nsmpl; i++)
- {
- int len = strlen(tab->cols[i]);
- if ( max_len < len ) max_len = len;
- }
- hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- char *ptr = args->tmps + ismpl*max_len;
- char *str = tab->cols[icol];
- i = 0;
- while ( str[i] )
- {
- ptr[i] = str[i];
- i++;
- }
- while ( i<max_len ) ptr[i++] = 0;
- icol++;
- }
- return bcf_update_format_char(args->hdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len);
-}
-static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc);
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2);
+ int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *dst = args->tmpi2 + nsrc*i;
+ int32_t *dst = args->tmpi2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
dst[0] = bcf_int32_missing;
- for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (j=1; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
{
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ int32_t *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *src = vals + nvals*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue;
- if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ // possible cases:
+ // in annot out
+ // x y x TAG,-TAG,=TAG .. REPLACE_ALL, REPLACE_NON_MISSING, SET_OR_APPEND
+ // x y y +TAG .. REPLACE_MISSING
+ // . y . =TAG .. SET_OR_APPEND
+ // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
+ // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING
+ // x . . -TAG .. REPLACE_NON_MISSING
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *ori = args->tmpi2 + ndst*i;
- int32_t *dst = args->tmpi3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1;
- if ( keep_ori )
+ int32_t *ann = vals + nvals*args->sample_map[i];
+ int32_t *ori = args->tmpi2 + ndst*i; // ori vcf line
+ int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
- {
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc);
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2);
+ int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *dst = args->tmpf2 + nsrc*i;
+ float *dst = args->tmpf2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
bcf_float_set_missing(dst[0]);
- for (j=1; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (j=1; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
{
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ float *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- float *src = args->tmpf + nsrc*args->sample_map[i];
+ float *src = vals + nvals*args->sample_map[i];
float *dst = args->tmpf2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue;
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *ori = args->tmpf2 + ndst*i;
- float *dst = args->tmpf3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- if ( keep_ori )
+ float *ann = vals + nvals*args->sample_map[i];
+ float *ori = args->tmpf2 + ndst*i; // ori vcf line
+ float *dst = args->tmpf3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
- {
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals)
{
- bcf1_t *rec = (bcf1_t*) data;
- args->tmpp[0] = args->tmps;
- int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps);
- args->tmps = args->tmpp[0]; // tmps might be realloced
- if ( ret==-3 ) return 0; // the tag is not present
- if ( ret<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot);
int i;
args->tmpp2[0] = args->tmps2;
- ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2);
+ int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2);
args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced
+ int nsmpl = bcf_hdr_nsamples(args->hdr_out);
if ( ret<=0 ) // not present in dst
{
hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ char *tmp = args->tmps2;
+ for (i=0; i<nsmpl; i++)
{
- args->tmps2[2*i] = '.';
- args->tmps2[2*i+1] = 0;
- args->tmpp2[i] = args->tmps2+2*i;
+ tmp[0] = '.';
+ tmp[1] = 0;
+ args->tmpp2[i] = tmp;
+ tmp += 2;
}
}
+ for (i=0; i<nsmpl; i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ char **src = vals + args->sample_map[i];
+ char **dst = args->tmpp2 + i;
+
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
+ *dst = *src;
+ }
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
+}
+static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi);
+
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ {
+ int32_t *ptr = args->tmpi + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ ptr[ival++] = bcf_int32_missing;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
+ icol++;
+ }
+ return core_setter_format_int(args,line,col,args->tmpi,nvals);
+}
+static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
{
- int isrc = args->sample_map[i];
- if ( isrc==-1 ) continue;
- args->tmpp2[i] = args->tmpp[isrc];
+ float *ptr = args->tmpf + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ bcf_float_set_missing(ptr[ival]);
+ ival++;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
+ icol++;
}
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out));
+ return core_setter_format_real(args,line,col,args->tmpf,nvals);
}
-static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples)
+static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
+
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+
+static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ args->tmpp[0] = args->tmps;
+ int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps);
+ args->tmps = args->tmpp[0]; // tmps might be realloced
+ if ( ret==-3 ) return 0; // the tag is not present
+ if ( ret<=0 ) return 1; // error
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
{
int i;
if ( !args->sample_names )
{
+ args->nsmpl_annot = bcf_hdr_nsamples(dst);
+
+ // tab annotation file, expecting that all samples are present: sample map not needed
+ if ( !src ) return 0;
+
int nmatch = 0, order_ok = 1;
for (i=0; i<bcf_hdr_nsamples(src); i++)
{
if ( i!=id ) order_ok = 0;
}
}
- if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok && !need_samples )
- return; // the same samples in both files
-
- if ( !nmatch ) error("No matching samples found in the source and the destination file\n");
- if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(stderr,"%d sample(s) in common\n", nmatch);
+ if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed
+ if ( !nmatch ) return -1; // No matching samples found in the source and the destination file
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
args->sample_map[i] = id; // idst -> isrc, -1 if not present
}
- return;
+ return 1;
}
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
- int nsamples = 0;
- char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples);
- for (i=0; i<nsamples; i++)
+ // possible todo: could do with smpl_ilist only
+ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT);
+ if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names);
+ char **samples = (char**) malloc(sizeof(char*)*ilist->n);
+ for (i=0; i<ilist->n; i++) samples[i] = strdup(dst->samples[i]);
+ args->nsmpl_annot = ilist->n;
+ smpl_ilist_destroy(ilist);
+ int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1;
+ if ( !src )
{
- int isrc, idst;
- char *ss = samples[i], *se = samples[i];
- while ( *se && !isspace(*se) ) se++;
- if ( !*se )
+ // tab annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
+ {
+ int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]);
+ args->sample_map[idst] = i;
+ if ( idst!=i ) need_sample_map = 1;
+ }
+ }
+ else
+ {
+ // vcf annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
{
- // only one sample name
+ int isrc, idst;
+ char *ss = samples[i], *se = samples[i];
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se )
+ {
+ // only one sample name
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+ args->sample_map[idst] = isrc;
+ if ( idst!=isrc ) need_sample_map = 1;
+ continue;
+ }
+ *se = 0;
isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+
+ ss = se+1;
+ while ( isspace(*ss) ) ss++;
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+
args->sample_map[idst] = isrc;
- continue;
+ if ( idst!=isrc ) need_sample_map = 1;
}
- *se = 0;
- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
-
- ss = se+1;
- while ( isspace(*ss) ) ss++;
- se = ss;
- while ( *se && !isspace(*se) ) se++;
-
- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
-
- args->sample_map[idst] = isrc;
}
- for (i=0; i<nsamples; i++) free(samples[i]);
+ for (i=0; i<args->nsmpl_annot; i++) free(samples[i]);
free(samples);
+ return need_sample_map;
}
static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
{
free(columns);
return str.s;
}
+static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str)
+{
+ int j, nout = 0;
+ ksprintf(str, "##%s=<", hrec->key);
+ for (j=0; j<hrec->nkeys; j++)
+ {
+ if ( !strcmp("IDX",hrec->keys[j]) ) continue;
+ if ( nout ) kputc(',',str);
+ if ( !strcmp("ID", hrec->keys[j]) )
+ ksprintf(str,"%s=%s", hrec->keys[j], tag);
+ else
+ ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
+ nout++;
+ }
+ ksprintf(str,">\n");
+}
static void init_columns(args_t *args)
{
+ int need_sample_map = 0;
+ int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
+
void *skip_fmt = NULL, *skip_info = NULL;
if ( args->tgts_is_vcf )
args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
kstring_t str = {0,0,0}, tmp = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
- int icol = -1, has_fmt_str = 0, force_samples = -1;
+ int icol = -1, has_fmt_str = 0;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
int replace = REPLACE_ALL;
if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
- else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
+ else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; }
else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
icol++;
str.l = 0;
else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol;
else if ( !strcasecmp("ID",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( !strcasecmp("FILTER",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
if ( args->tgts_is_vcf )
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
}
else if ( !strcasecmp("QUAL",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ need_sample_map = 1;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
{
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
- if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
+ if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
}
else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
{
- char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7;
+ else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4;
+ }
+ else
+ key_src = key_dst;
+ need_sample_map = 1;
if ( args->tgts_is_vcf )
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
}
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
if ( !args->tgts_is_vcf )
{
col->icol = icol;
- icol += bcf_hdr_nsamples(args->hdr_out) - 1;
+ icol += args->nsmpl_annot - 1;
}
else
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(key);
- if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
+ if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
}
else
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
- if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); }
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s;
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5;
+ }
+ else
+ key_src = key_dst;
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
- hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
args->tmpp = (char**)malloc(sizeof(char*)*n);
args->tmpp2 = (char**)malloc(sizeof(char*)*n);
}
- if ( force_samples>=0 && args->tgts_is_vcf )
- set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
+ if ( !need_sample_map )
+ {
+ free(args->sample_map);
+ args->sample_map = NULL;
+ }
+ else if ( sample_map_ok<0 )
+ error("No matching samples in source and destination file?\n");
}
static void rename_chrs(args_t *args, char *fname)
if ( args->mark_sites )
{
if ( !args->targets_fname ) error("The -a option not given\n");
- if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add..
bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
bcf_hdr_write(args->out_fh, args->hdr_out);
}
}
if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
if (args->vcmp) vcmp_destroy(args->vcmp);
for (i=0; i<args->ncols; i++)
- free(args->cols[i].hdr_key);
+ {
+ free(args->cols[i].hdr_key_src);
+ free(args->cols[i].hdr_key_dst);
+ }
free(args->cols);
for (i=0; i<args->malines; i++)
{
// there is a matching line
for (j=0; j<args->ncols; j++)
if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
}
bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?0:1);
}
}
- else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) )
+ else if ( args->files->nreaders == 2 )
{
- bcf1_t *aline = bcf_sr_get_line(args->files,1);
- for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ if ( bcf_sr_has_line(args->files,1) )
+ {
+ bcf1_t *aline = bcf_sr_get_line(args->files,1);
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0);
+ }
+ else if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 1 : 0);
}
if ( args->set_ids )
{
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(stderr, " --collapse <string> matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
fprintf(stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
args->set_ids_replace = 1;
- int regions_is_file = 0;
+ int regions_is_file = 0, collapse = 0;
static struct option loptions[] =
{
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"annotations",required_argument,NULL,'a'},
+ {"collapse",required_argument,NULL,2},
{"include",required_argument,NULL,'i'},
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 'h': args->header_fname = optarg; break;
case 1 : args->rename_chrs = optarg; break;
+ case 2 :
+ if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
{
args->tgts_is_vcf = 1;
args->files->require_index = 1;
- args->files->collapse |= COLLAPSE_SOME;
+ args->files->collapse = collapse ? collapse : COLLAPSE_SOME;
}
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
#include "vcmp.h"
#include "filter.h"
#include "convert.h"
+#include "smpl_ilist.h"
struct _args_t;
#define REPLACE_MISSING 0 // replace only missing values
#define REPLACE_ALL 1 // replace both missing and existing values
-#define REPLACE_EXISTING 2 // replace only if tgt is not missing
+#define REPLACE_NON_MISSING 2 // replace only if tgt is not missing
#define SET_OR_APPEND 3 // set new value if missing or non-existent, append otherwise
typedef struct _annot_col_t
{
int icol, replace, number; // number: one of BCF_VL_* types
- char *hdr_key;
+ char *hdr_key_src, *hdr_key_dst;
int (*setter)(struct _args_t *, bcf1_t *, struct _annot_col_t *, void*);
}
annot_col_t;
convert_t *set_ids;
int set_ids_replace;
+ int nsmpl_annot;
int *sample_map, nsample_map, sample_is_file; // map[idst] -> isrc
int mtmpi, mtmpf, mtmps;
int mtmpi2, mtmpf2, mtmps2;
}
line->d.shared_dirty |= BCF1_DIRTY_INF;
inf->vptr = NULL;
+ inf->vptr_off = inf->vptr_len = 0;
}
}
void remove_info_tag(args_t *args, bcf1_t *line, rm_tag_t *tag)
}
}
+#include "htslib/khash.h"
+KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+typedef khash_t(vdict) vdict_t;
+
static void remove_hdr_lines(bcf_hdr_t *hdr, int type)
{
int i = 0, nrm = 0;
{
if ( hdr->hrec[i]->type!=type ) { i++; continue; }
bcf_hrec_t *hrec = hdr->hrec[i];
- if ( type==BCF_HL_FMT )
+ if ( type==BCF_HL_FMT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG )
{
// everything except FORMAT/GT
int id = bcf_hrec_find_key(hrec, "ID");
- if ( id>=0 && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ if ( id>=0 )
+ {
+ if ( type==BCF_HL_FMT && !strcmp(hrec->vals[id],"GT") ) { i++; continue; }
+ vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID];
+ khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[id]);
+ kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL;
+ kh_val(d, k).info[type] |= 0xf;
+ }
}
nrm++;
hdr->nhrec--;
line->qual = strtod(str, &str);
if ( str == tab->cols[col->icol] )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return 0;
}
static int vcf_setter_qual(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
char *str = tab->cols[col->icol];
if ( str[0]=='.' && str[1]==0 ) return 0;
- if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,1);
- if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,0);
+ if ( str[0]=='1' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,1);
+ if ( str[0]=='0' && str[1]==0 ) return bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,0);
error("Could not parse %s at %s:%d .. [%s]\n", bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
return -1;
}
static int vcf_setter_info_flag(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key,NULL,NULL);
- bcf_update_info_flag(args->hdr_out,line,col->hdr_key,NULL,flag);
+ int flag = bcf_get_info_flag(args->files->readers[1].header,rec,col->hdr_key_src,NULL,NULL);
+ bcf_update_info_flag(args->hdr_out,line,col->hdr_key_dst,NULL,flag);
return 0;
}
static int setter_ARinfo_int32(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpi)
{
if ( col->number==BCF_VL_A && ntmpi!=nals-1 && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpi!=nals && (ntmpi!=1 || args->tmpi[0]!=bcf_int32_missing || args->tmpi[1]!=bcf_int32_vector_end) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpi,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ntmpi2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ntmpi2 < ndst ) hts_expand(int32_t,ndst,args->mtmpi2,args->tmpi2);
int i;
args->tmpi2[i] = args->tmpi[ map[i] ];
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst);
return 0;
}
static int setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int vcf_setter_info_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
+ int ntmpi = bcf_get_info_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
if ( ntmpi < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key, &args->tmpi2, &args->mtmpi2);
+ int ret = bcf_get_info_int32(args->hdr, line, col->hdr_key_dst, &args->tmpi2, &args->mtmpi2);
if ( ret>0 && args->tmpi2[0]!=bcf_int32_missing ) return 0;
}
- bcf_update_info_int32(args->hdr_out,line,col->hdr_key,args->tmpi,ntmpi);
+ bcf_update_info_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi,ntmpi);
return 0;
}
static int setter_ARinfo_real(args_t *args, bcf1_t *line, annot_col_t *col, int nals, char **als, int ntmpf)
{
if ( col->number==BCF_VL_A && ntmpf!=nals-1 && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && ntmpf!=nals && (ntmpf!=1 || !bcf_float_is_missing(args->tmpf[0]) || !bcf_float_is_vector_end(args->tmpf[0])) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", ntmpf,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
if ( !map ) error("REF alleles not compatible at %s:%d\n");
// fill in any missing values in the target VCF (or all, if not present)
- int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ntmpf2 = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ntmpf2 < ndst ) hts_expand(float,ndst,args->mtmpf2,args->tmpf2);
int i;
args->tmpf2[i] = args->tmpf[ map[i] ];
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst);
return 0;
}
static int setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
static int vcf_setter_info_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
+ int ntmpf = bcf_get_info_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
if ( ntmpf < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_float(args->hdr, line, col->hdr_key, &args->tmpf2, &args->mtmpf2);
+ int ret = bcf_get_info_float(args->hdr, line, col->hdr_key_dst, &args->tmpf2, &args->mtmpf2);
if ( ret>0 && !bcf_float_is_missing(args->tmpf2[0]) ) return 0;
}
- bcf_update_info_float(args->hdr_out,line,col->hdr_key,args->tmpf,ntmpf);
+ bcf_update_info_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf,ntmpf);
return 0;
}
int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst); // see vcfmerge.c
lsrc++;
}
if ( col->number==BCF_VL_A && nsrc!=nals-1 && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
else if ( col->number==BCF_VL_R && nsrc!=nals && (nsrc!=1 || args->tmps[0]!='.' || args->tmps[1]!=0 ) )
- error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("Incorrect number of values (%d) for the %s tag at %s:%d\n", nsrc,col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
int ndst = col->number==BCF_VL_A ? line->n_allele - 1 : line->n_allele;
int *map = vcmp_map_ARvalues(args->vcmp,ndst,nals,als,line->n_allele,line->d.allele);
// fill in any missing values in the target VCF (or all, if not present)
int i, empty = 0, nstr, mstr = args->tmpks.m;
- nstr = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmpks.s, &mstr);
+ nstr = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmpks.s, &mstr);
args->tmpks.m = mstr;
if ( nstr<0 || (nstr==1 && args->tmpks.s[0]=='.' && args->tmpks.s[1]==0) )
{
int ret = copy_string_field(args->tmps,map[i],lsrc,&args->tmpks,i);
assert( ret==0 );
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmpks.s);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmpks.s);
return 0;
}
static int setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_info_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
{
bcf1_t *rec = (bcf1_t*) data;
- int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmps,&args->mtmps);
+ int ntmps = bcf_get_info_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmps,&args->mtmps);
if ( ntmps < 0 ) return 0; // nothing to add
if ( col->number==BCF_VL_A || col->number==BCF_VL_R )
if ( col->replace==REPLACE_MISSING )
{
- int ret = bcf_get_info_string(args->hdr, line, col->hdr_key, &args->tmps2, &args->mtmps2);
+ int ret = bcf_get_info_string(args->hdr, line, col->hdr_key_dst, &args->tmps2, &args->mtmps2);
if ( ret>0 && (args->tmps2[0]!='.' || args->tmps2[1]!=0) ) return 0;
}
- bcf_update_info_string(args->hdr_out,line,col->hdr_key,args->tmps);
+ bcf_update_info_string(args->hdr_out,line,col->hdr_key_dst,args->tmps);
return 0;
}
static int vcf_setter_format_gt(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 ) // field not present in dst file
{
- if ( col->replace==REPLACE_EXISTING ) return 0;
+ if ( col->replace==REPLACE_NON_MISSING ) return 0;
hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
int32_t *src = args->tmpi + nsrc*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(dst[0]) ) continue;
+ if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(dst[0]) ) continue;
if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(dst[0]) ) continue;
for (j=0; j<nsrc; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
int32_t *dst = args->tmpi3 + nsrc*i;
int keep_ori = 0;
if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
+ else if ( col->replace==REPLACE_NON_MISSING && bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
else if ( col->replace==REPLACE_MISSING && !bcf_gt_is_missing(ori[0]) ) keep_ori = 1;
if ( keep_ori )
{
}
static int count_vals(annot_line_t *tab, int icol_beg, int icol_end)
{
- int i, nmax = 0;
+ int i, nmax = 1;
for (i=icol_beg; i<icol_end; i++)
{
char *str = tab->cols[i], *end = str;
}
return nmax;
}
-static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(int32_t,nvals*nsmpl,args->mtmpi,args->tmpi);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- int32_t *ptr = args->tmpi + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- ptr[ival++] = bcf_int32_missing;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtol(str, &end, 10);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
- icol++;
- }
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsmpl*nvals);
-}
-static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
- int nvals = count_vals(tab,col->icol,col->icol+nsmpl);
- assert( nvals>0 );
- hts_expand(float,nvals*nsmpl,args->mtmpf,args->tmpf);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- float *ptr = args->tmpf + ismpl*nvals;
- int ival = 0;
-
- char *str = tab->cols[icol];
- while ( *str )
- {
- if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
- {
- bcf_float_set_missing(ptr[ival]);
- ival++;
- str += str[1] ? 2 : 1;
- continue;
- }
-
- char *end = str;
- ptr[ival] = strtod(str, &end);
- if ( end==str )
- error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
-
- ival++;
- str = *end ? end+1 : end;
- }
- while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
- icol++;
- }
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsmpl*nvals);
-}
-static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
-{
- annot_line_t *tab = (annot_line_t*) data;
- int nsmpl = bcf_hdr_nsamples(args->hdr_out);
- assert( col->icol+nsmpl <= tab->ncols );
-
- int i, max_len = 0;
- for (i=col->icol; i<col->icol+nsmpl; i++)
- {
- int len = strlen(tab->cols[i]);
- if ( max_len < len ) max_len = len;
- }
- hts_expand(char,max_len*nsmpl,args->mtmps,args->tmps);
-
- int icol = col->icol, ismpl;
- for (ismpl=0; ismpl<nsmpl; ismpl++)
- {
- char *ptr = args->tmps + ismpl*max_len;
- char *str = tab->cols[icol];
- i = 0;
- while ( str[i] )
- {
- ptr[i] = str[i];
- i++;
- }
- while ( i<max_len ) ptr[i++] = 0;
- icol++;
- }
- return bcf_update_format_char(args->hdr_out,line,col->hdr_key,args->tmps,nsmpl*max_len);
-}
-static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, int32_t *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key,&args->tmpi,&args->mtmpi);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi,nsrc);
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key,&args->tmpi2,&args->mtmpi2);
+ int i, j, ndst = bcf_get_format_int32(args->hdr,line,col->hdr_key_dst,&args->tmpi2,&args->mtmpi2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi2, args->tmpi2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *dst = args->tmpi2 + nsrc*i;
+ int32_t *dst = args->tmpi2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
dst[0] = bcf_int32_missing;
- for (j=1; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (j=1; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
{
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ int32_t *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
+ int32_t *src = vals + nvals*args->sample_map[i];
int32_t *dst = args->tmpi2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && dst[0]==bcf_int32_missing ) continue;
- if ( col->replace==REPLACE_MISSING && dst[0]!=bcf_int32_missing ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ // possible cases:
+ // in annot out
+ // x y x TAG,-TAG,=TAG .. REPLACE_ALL, REPLACE_NON_MISSING, SET_OR_APPEND
+ // x y y +TAG .. REPLACE_MISSING
+ // . y . =TAG .. SET_OR_APPEND
+ // . y y TAG,+TAG,-TAG .. REPLACE_ALL, REPLACE_MISSING, REPLACE_NON_MISSING
+ // x . x TAG,+TAG .. REPLACE_ALL, REPLACE_MISSING
+ // x . . -TAG .. REPLACE_NON_MISSING
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( dst[0]==bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( dst[0]!=bcf_int32_missing ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( src[0]==bcf_int32_missing ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) dst[j] = bcf_int32_vector_end;
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(int32_t, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
+ hts_expand(int32_t, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpi3, args->tmpi3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- int32_t *ori = args->tmpi2 + ndst*i;
- int32_t *dst = args->tmpi3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && ori[0]==bcf_int32_missing ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && ori[0]!=bcf_int32_missing ) keep_ori = 1;
- if ( keep_ori )
+ int32_t *ann = vals + nvals*args->sample_map[i];
+ int32_t *ori = args->tmpi2 + ndst*i; // ori vcf line
+ int32_t *dst = args->tmpi3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( ori[0]==bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( ori[0]!=bcf_int32_missing ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( ann[0]==bcf_int32_missing ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) dst[j] = bcf_int32_vector_end;
+ for (; j<nvals; j++) dst[j] = bcf_int32_vector_end;
}
else
- {
- int32_t *src = args->tmpi + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_int32(args->hdr_out,line,col->hdr_key,args->tmpi3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_int32(args->hdr_out,line,col->hdr_key_dst,args->tmpi3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, float *vals, int nvals)
{
- bcf1_t *rec = (bcf1_t*) data;
- int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key,&args->tmpf,&args->mtmpf);
- if ( nsrc==-3 ) return 0; // the tag is not present
- if ( nsrc<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf,nsrc);
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,vals,nvals*args->nsmpl_annot);
- int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key,&args->tmpf2,&args->mtmpf2);
+ int i, j, ndst = bcf_get_format_float(args->hdr,line,col->hdr_key_dst,&args->tmpf2,&args->mtmpf2);
if ( ndst > 0 ) ndst /= bcf_hdr_nsamples(args->hdr_out);
- nsrc /= bcf_hdr_nsamples(args->files->readers[1].header);
if ( ndst<=0 )
{
- if ( col->replace==REPLACE_EXISTING ) return 0; // overwrite only if present
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
+ if ( col->replace==REPLACE_NON_MISSING ) return 0; // overwrite only if present
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf2, args->tmpf2);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *dst = args->tmpf2 + nsrc*i;
+ float *dst = args->tmpf2 + nvals*i;
if ( args->sample_map[i]==-1 )
{
bcf_float_set_missing(dst[0]);
- for (j=1; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (j=1; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
{
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ float *src = vals + nvals*args->sample_map[i];
+ for (j=0; j<nvals; j++) dst[j] = src[j];
}
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,nvals*bcf_hdr_nsamples(args->hdr_out));
}
- else if ( ndst >= nsrc )
+ else if ( ndst >= nvals )
{
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
if ( args->sample_map[i]==-1 ) continue;
- float *src = args->tmpf + nsrc*args->sample_map[i];
+ float *src = vals + nvals*args->sample_map[i];
float *dst = args->tmpf2 + ndst*i;
- if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(dst[0]) ) continue;
- if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(dst[0]) ) continue;
- for (j=0; j<nsrc; j++) dst[j] = src[j];
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(dst[0]) ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(src[0]) ) continue; }
+ for (j=0; j<nvals; j++) dst[j] = src[j];
for (; j<ndst; j++) bcf_float_set_vector_end(dst[j]);
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf2,ndst*bcf_hdr_nsamples(args->hdr_out));
}
- else // ndst < nsrc
+ else // ndst < nvals
{
- hts_expand(float, nsrc*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
+ hts_expand(float, nvals*bcf_hdr_nsamples(args->hdr_out), args->mtmpf3, args->tmpf3);
for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
{
- float *ori = args->tmpf2 + ndst*i;
- float *dst = args->tmpf3 + nsrc*i;
- int keep_ori = 0;
- if ( args->sample_map[i]==-1 ) keep_ori = 1;
- else if ( col->replace==REPLACE_EXISTING && bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- else if ( col->replace==REPLACE_MISSING && !bcf_float_is_missing(ori[0]) ) keep_ori = 1;
- if ( keep_ori )
+ float *ann = vals + nvals*args->sample_map[i];
+ float *ori = args->tmpf2 + ndst*i; // ori vcf line
+ float *dst = args->tmpf3 + nvals*i; // expanded buffer
+ int use_new_ann = 1;
+ if ( args->sample_map[i]==-1 ) use_new_ann = 0;
+ else if ( col->replace==REPLACE_NON_MISSING ) { if ( bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( !bcf_float_is_missing(ori[0]) ) use_new_ann = 0; }
+ else if ( col->replace==REPLACE_ALL ) { if ( bcf_float_is_missing(ann[0]) ) use_new_ann = 0; }
+ if ( !use_new_ann )
{
for (j=0; j<ndst; j++) dst[j] = ori[j];
- for (; j<nsrc; j++) bcf_float_set_vector_end(dst[j]);
+ for (; j<nvals; j++) bcf_float_set_vector_end(dst[j]);
}
else
- {
- float *src = args->tmpf + nsrc*args->sample_map[i];
- for (j=0; j<nsrc; j++) dst[j] = src[j];
- }
+ for (j=0; j<nvals; j++) dst[j] = ann[j];
}
- return bcf_update_format_float(args->hdr_out,line,col->hdr_key,args->tmpf3,nsrc*bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_float(args->hdr_out,line,col->hdr_key_dst,args->tmpf3,nvals*bcf_hdr_nsamples(args->hdr_out));
}
}
-static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+static int core_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, char **vals)
{
- bcf1_t *rec = (bcf1_t*) data;
- args->tmpp[0] = args->tmps;
- int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key,&args->tmpp,&args->mtmps);
- args->tmps = args->tmpp[0]; // tmps might be realloced
- if ( ret==-3 ) return 0; // the tag is not present
- if ( ret<=0 ) return 1; // error
-
if ( !args->sample_map )
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp,bcf_hdr_nsamples(args->hdr_out));
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)vals,args->nsmpl_annot);
int i;
args->tmpp2[0] = args->tmps2;
- ret = bcf_get_format_string(args->hdr,line,col->hdr_key,&args->tmpp2,&args->mtmps2);
+ int ret = bcf_get_format_string(args->hdr,line,col->hdr_key_dst,&args->tmpp2,&args->mtmps2);
args->tmps2 = args->tmpp2[0]; // tmps2 might be realloced
+ int nsmpl = bcf_hdr_nsamples(args->hdr_out);
if ( ret<=0 ) // not present in dst
{
hts_expand(char,bcf_hdr_nsamples(args->hdr_out)*2,args->mtmps2,args->tmps2);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ char *tmp = args->tmps2;
+ for (i=0; i<nsmpl; i++)
{
- args->tmps2[2*i] = '.';
- args->tmps2[2*i+1] = 0;
- args->tmpp2[i] = args->tmps2+2*i;
+ tmp[0] = '.';
+ tmp[1] = 0;
+ args->tmpp2[i] = tmp;
+ tmp += 2;
}
}
+ for (i=0; i<nsmpl; i++)
+ {
+ if ( args->sample_map[i]==-1 ) continue;
+ char **src = vals + args->sample_map[i];
+ char **dst = args->tmpp2 + i;
+
+ if ( col->replace==REPLACE_NON_MISSING ) { if ( (*dst)[0]=='.' && (*dst)[1]==0 ) continue; }
+ else if ( col->replace==REPLACE_MISSING ) { if ( (*dst)[0]!='.' || (*dst)[1]!=0 ) continue; }
+ else if ( col->replace==REPLACE_ALL ) { if ( (*src)[0]=='.' && (*src)[1]==0 ) continue; }
+ *dst = *src;
+ }
+ return bcf_update_format_string(args->hdr_out,line,col->hdr_key_dst,(const char**)args->tmpp2,nsmpl);
+}
+static int setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(int32_t,nvals*args->nsmpl_annot,args->mtmpi,args->tmpi);
+
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ {
+ int32_t *ptr = args->tmpi + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ ptr[ival++] = bcf_int32_missing;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtol(str, &end, 10);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) ptr[ival++] = bcf_int32_vector_end;
+ icol++;
+ }
+ return core_setter_format_int(args,line,col,args->tmpi,nvals);
+}
+static int setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+ int nvals = count_vals(tab,col->icol,col->icol+args->nsmpl_annot);
+ hts_expand(float,nvals*args->nsmpl_annot,args->mtmpf,args->tmpf);
- for (i=0; i<bcf_hdr_nsamples(args->hdr_out); i++)
+ int icol = col->icol, ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
{
- int isrc = args->sample_map[i];
- if ( isrc==-1 ) continue;
- args->tmpp2[i] = args->tmpp[isrc];
+ float *ptr = args->tmpf + ismpl*nvals;
+ int ival = 0;
+
+ char *str = tab->cols[icol];
+ while ( *str )
+ {
+ if ( str[0]=='.' && (!str[1] || str[1]==',') ) // missing value
+ {
+ bcf_float_set_missing(ptr[ival]);
+ ival++;
+ str += str[1] ? 2 : 1;
+ continue;
+ }
+
+ char *end = str;
+ ptr[ival] = strtod(str, &end);
+ if ( end==str )
+ error("Could not parse %s at %s:%d .. [%s]\n", col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1,tab->cols[col->icol]);
+
+ ival++;
+ str = *end ? end+1 : end;
+ }
+ while ( ival<nvals ) { bcf_float_set_vector_end(ptr[ival]); ival++; }
+ icol++;
}
- return bcf_update_format_string(args->hdr_out,line,col->hdr_key,(const char**)args->tmpp2,bcf_hdr_nsamples(args->hdr_out));
+ return core_setter_format_real(args,line,col,args->tmpf,nvals);
}
-static void set_samples(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst, int need_samples)
+static int setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ annot_line_t *tab = (annot_line_t*) data;
+ if ( col->icol+args->nsmpl_annot > tab->ncols )
+ error("Incorrect number of values for %s at %s:%d\n",col->hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ int ismpl;
+ for (ismpl=0; ismpl<args->nsmpl_annot; ismpl++)
+ args->tmpp[ismpl] = tab->cols[col->icol + ismpl];
+
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int vcf_setter_format_int(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_int32(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpi,&args->mtmpi);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_int(args,line,col,args->tmpi,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+static int vcf_setter_format_real(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ int nsrc = bcf_get_format_float(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpf,&args->mtmpf);
+ if ( nsrc==-3 ) return 0; // the tag is not present
+ if ( nsrc<=0 ) return 1; // error
+ return core_setter_format_real(args,line,col,args->tmpf,nsrc/bcf_hdr_nsamples(args->files->readers[1].header));
+}
+
+static int vcf_setter_format_str(args_t *args, bcf1_t *line, annot_col_t *col, void *data)
+{
+ bcf1_t *rec = (bcf1_t*) data;
+ args->tmpp[0] = args->tmps;
+ int ret = bcf_get_format_string(args->files->readers[1].header,rec,col->hdr_key_src,&args->tmpp,&args->mtmps);
+ args->tmps = args->tmpp[0]; // tmps might be realloced
+ if ( ret==-3 ) return 0; // the tag is not present
+ if ( ret<=0 ) return 1; // error
+ return core_setter_format_str(args,line,col,args->tmpp);
+}
+static int init_sample_map(args_t *args, bcf_hdr_t *src, bcf_hdr_t *dst)
{
int i;
if ( !args->sample_names )
{
+ args->nsmpl_annot = bcf_hdr_nsamples(dst);
+
+ // tab annotation file, expecting that all samples are present: sample map not needed
+ if ( !src ) return 0;
+
int nmatch = 0, order_ok = 1;
for (i=0; i<bcf_hdr_nsamples(src); i++)
{
if ( i!=id ) order_ok = 0;
}
}
- if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok && !need_samples )
- return; // the same samples in both files
-
- if ( !nmatch ) error("No matching samples found in the source and the destination file\n");
- if ( nmatch!=bcf_hdr_nsamples(src) || nmatch!=bcf_hdr_nsamples(dst) ) fprintf(pysam_stderr,"%d sample(s) in common\n", nmatch);
+ if ( bcf_hdr_nsamples(src)==bcf_hdr_nsamples(dst) && nmatch==bcf_hdr_nsamples(src) && order_ok ) return 0; // not needed
+ if ( !nmatch ) return -1; // No matching samples found in the source and the destination file
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
int id = bcf_hdr_id2int(src, BCF_DT_SAMPLE, dst->samples[i]);
args->sample_map[i] = id; // idst -> isrc, -1 if not present
}
- return;
+ return 1;
}
args->nsample_map = bcf_hdr_nsamples(dst);
args->sample_map = (int*) malloc(sizeof(int)*args->nsample_map);
for (i=0; i<args->nsample_map; i++) args->sample_map[i] = -1;
- int nsamples = 0;
- char **samples = hts_readlist(args->sample_names, args->sample_is_file, &nsamples);
- for (i=0; i<nsamples; i++)
+ // possible todo: could do with smpl_ilist only
+ smpl_ilist_t *ilist = smpl_ilist_init(dst, args->sample_names, args->sample_is_file, SMPL_STRICT);
+ if ( !ilist || !ilist->n ) error("Could not parse: %s\n", args->sample_names);
+ char **samples = (char**) malloc(sizeof(char*)*ilist->n);
+ for (i=0; i<ilist->n; i++) samples[i] = strdup(dst->samples[i]);
+ args->nsmpl_annot = ilist->n;
+ smpl_ilist_destroy(ilist);
+ int need_sample_map = args->nsmpl_annot==bcf_hdr_nsamples(dst) ? 0 : 1;
+ if ( !src )
{
- int isrc, idst;
- char *ss = samples[i], *se = samples[i];
- while ( *se && !isspace(*se) ) se++;
- if ( !*se )
+ // tab annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
+ {
+ int idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE, samples[i]);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", samples[i]);
+ args->sample_map[idst] = i;
+ if ( idst!=i ) need_sample_map = 1;
+ }
+ }
+ else
+ {
+ // vcf annotation file
+ for (i=0; i<args->nsmpl_annot; i++)
{
- // only one sample name
+ int isrc, idst;
+ char *ss = samples[i], *se = samples[i];
+ while ( *se && !isspace(*se) ) se++;
+ if ( !*se )
+ {
+ // only one sample name
+ isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
+ if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+ idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
+ if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+ args->sample_map[idst] = isrc;
+ if ( idst!=isrc ) need_sample_map = 1;
+ continue;
+ }
+ *se = 0;
isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
+
+ ss = se+1;
+ while ( isspace(*ss) ) ss++;
+ se = ss;
+ while ( *se && !isspace(*se) ) se++;
+
idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
+
args->sample_map[idst] = isrc;
- continue;
+ if ( idst!=isrc ) need_sample_map = 1;
}
- *se = 0;
- isrc = bcf_hdr_id2int(src, BCF_DT_SAMPLE,ss);
- if ( isrc==-1 ) error("Sample \"%s\" not found in the source file\n", ss);
-
- ss = se+1;
- while ( isspace(*ss) ) ss++;
- se = ss;
- while ( *se && !isspace(*se) ) se++;
-
- idst = bcf_hdr_id2int(dst, BCF_DT_SAMPLE,ss);
- if ( idst==-1 ) error("Sample \"%s\" not found in the destination file\n", ss);
-
- args->sample_map[idst] = isrc;
}
- for (i=0; i<nsamples; i++) free(samples[i]);
+ for (i=0; i<args->nsmpl_annot; i++) free(samples[i]);
free(samples);
+ return need_sample_map;
}
static char *columns_complement(char *columns, void **skip_info, void **skip_fmt)
{
free(columns);
return str.s;
}
+static void bcf_hrec_format_rename(bcf_hrec_t *hrec, char *tag, kstring_t *str)
+{
+ int j, nout = 0;
+ ksprintf(str, "##%s=<", hrec->key);
+ for (j=0; j<hrec->nkeys; j++)
+ {
+ if ( !strcmp("IDX",hrec->keys[j]) ) continue;
+ if ( nout ) kputc(',',str);
+ if ( !strcmp("ID", hrec->keys[j]) )
+ ksprintf(str,"%s=%s", hrec->keys[j], tag);
+ else
+ ksprintf(str,"%s=%s", hrec->keys[j], hrec->vals[j]);
+ nout++;
+ }
+ ksprintf(str,">\n");
+}
static void init_columns(args_t *args)
{
+ int need_sample_map = 0;
+ int sample_map_ok = init_sample_map(args, args->tgts_is_vcf?args->files->readers[1].header:NULL, args->hdr);
+
void *skip_fmt = NULL, *skip_info = NULL;
if ( args->tgts_is_vcf )
args->columns = columns_complement(args->columns, &skip_info, &skip_fmt);
kstring_t str = {0,0,0}, tmp = {0,0,0};
char *ss = args->columns, *se = ss;
args->ncols = 0;
- int icol = -1, has_fmt_str = 0, force_samples = -1;
+ int icol = -1, has_fmt_str = 0;
while ( *ss )
{
if ( *se && *se!=',' ) { se++; continue; }
int replace = REPLACE_ALL;
if ( *ss=='+' ) { replace = REPLACE_MISSING; ss++; }
- else if ( *ss=='-' ) { replace = REPLACE_EXISTING; ss++; }
+ else if ( *ss=='-' ) { replace = REPLACE_NON_MISSING; ss++; }
else if ( *ss=='=' ) { replace = SET_OR_APPEND; ss++; }
icol++;
str.l = 0;
else if ( !strcasecmp("ALT",str.s) ) args->alt_idx = icol;
else if ( !strcasecmp("ID",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -ID feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -ID feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_id : setter_id;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( !strcasecmp("FILTER",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -FILTER feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_filter : setter_filter;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
if ( args->tgts_is_vcf )
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
}
else if ( !strcasecmp("QUAL",str.s) )
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -QUAL feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =QUAL feature has not been implemented yet.\n");
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
col->setter = args->tgts_is_vcf ? vcf_setter_qual : setter_qual;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(str.s);
+ col->hdr_key_dst = strdup(str.s);
}
else if ( args->tgts_is_vcf && !strcasecmp("INFO",str.s) ) // All INFO fields
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
int j;
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
else if ( args->tgts_is_vcf && (!strcasecmp("FORMAT",str.s) || !strcasecmp("FMT",str.s)) ) // All FORMAT fields
{
bcf_hdr_t *tgts_hdr = args->files->readers[1].header;
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ need_sample_map = 1;
int j;
for (j=0; j<tgts_hdr->nhrec; j++)
{
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(hrec->vals[k]);
- if ( !strcasecmp("GT",col->hdr_key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(hrec->vals[k]);
+ col->hdr_key_dst = strdup(hrec->vals[k]);
+ if ( !strcasecmp("GT",col->hdr_key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
}
else if ( !strncasecmp("FORMAT/",str.s, 7) || !strncasecmp("FMT/",str.s,4) )
{
- char *key = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
- if ( force_samples<0 ) force_samples = replace;
- if ( force_samples>=0 && replace!=REPLACE_ALL ) force_samples = replace;
+ char *key_dst = str.s + (!strncasecmp("FMT/",str.s,4) ? 4 : 7);
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("FORMAT/",key_src,7) ) key_src += 7;
+ else if ( !strncasecmp("FMT/",key_src,4) ) key_src += 4;
+ }
+ else
+ key_src = key_dst;
+ need_sample_map = 1;
if ( args->tgts_is_vcf )
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_FMT, "ID", key_src, NULL);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
}
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key);
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_FMT,hdr_id) )
error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
args->ncols++; args->cols = (annot_col_t*) realloc(args->cols,sizeof(annot_col_t)*args->ncols);
if ( !args->tgts_is_vcf )
{
col->icol = icol;
- icol += bcf_hdr_nsamples(args->hdr_out) - 1;
+ icol += args->nsmpl_annot - 1;
}
else
col->icol = -1;
col->replace = replace;
- col->hdr_key = strdup(key);
- if ( !strcasecmp("GT",key) ) col->setter = vcf_setter_format_gt;
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
+ if ( !strcasecmp("GT",key_src) ) col->setter = vcf_setter_format_gt;
else
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_FMT,hdr_id) )
{
}
else
{
- if ( replace==REPLACE_EXISTING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
+ if ( replace==REPLACE_NON_MISSING ) error("Apologies, the -INFO/TAG feature has not been implemented yet.\n");
if ( replace==SET_OR_APPEND ) error("Apologies, the =INFO/TAG feature has not been implemented yet.\n");
- if ( !strncasecmp("INFO/",str.s,5) ) { memmove(str.s,str.s+5,str.l-4); }
- int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ char *key_dst = !strncasecmp("INFO/",str.s,5) ? str.s + 5 : str.s;
+ char *key_src = strstr(key_dst,":=");
+ if ( key_src )
+ {
+ *key_src = 0;
+ key_src += 2;
+ if ( !strncasecmp("INFO/",key_src,5) ) key_src += 5;
+ }
+ else
+ key_src = key_dst;
+ int hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
if ( !bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
if ( args->tgts_is_vcf ) // reading annotations from a VCF, add a new header line
{
- bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", str.s, NULL);
+ bcf_hrec_t *hrec = bcf_hdr_get_hrec(args->files->readers[1].header, BCF_HL_INFO, "ID", key_src, NULL);
if ( !hrec ) error("The tag \"%s\" is not defined in %s\n", str.s,args->files->readers[1].fname);
tmp.l = 0;
- bcf_hrec_format(hrec, &tmp);
+ bcf_hrec_format_rename(hrec, key_dst, &tmp);
bcf_hdr_append(args->hdr_out, tmp.s);
bcf_hdr_sync(args->hdr_out);
- hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, str.s);
+ hdr_id = bcf_hdr_id2int(args->hdr_out, BCF_DT_ID, key_dst);
}
else
- error("The tag \"%s\" is not defined in %s\n", str.s, args->targets_fname);
+ error("The tag \"%s\" is not defined in %s\n", key_src, args->targets_fname);
assert( bcf_hdr_idinfo_exists(args->hdr_out,BCF_HL_INFO,hdr_id) );
}
annot_col_t *col = &args->cols[args->ncols-1];
col->icol = icol;
col->replace = replace;
- col->hdr_key = strdup(str.s);
+ col->hdr_key_src = strdup(key_src);
+ col->hdr_key_dst = strdup(key_dst);
col->number = bcf_hdr_id2length(args->hdr_out,BCF_HL_INFO,hdr_id);
switch ( bcf_hdr_id2type(args->hdr_out,BCF_HL_INFO,hdr_id) )
{
args->tmpp = (char**)malloc(sizeof(char*)*n);
args->tmpp2 = (char**)malloc(sizeof(char*)*n);
}
- if ( force_samples>=0 && args->tgts_is_vcf )
- set_samples(args, args->files->readers[1].header, args->hdr, force_samples==REPLACE_ALL ? 0 : 1);
+ if ( !need_sample_map )
+ {
+ free(args->sample_map);
+ args->sample_map = NULL;
+ }
+ else if ( sample_map_ok<0 )
+ error("No matching samples in source and destination file?\n");
}
static void rename_chrs(args_t *args, char *fname)
if ( args->mark_sites )
{
if ( !args->targets_fname ) error("The -a option not given\n");
- if ( args->tgts_is_vcf ) error("Apologies, this has not been implemented yet: -a is a VCF\n"); // very easy to add..
bcf_hdr_printf(args->hdr_out,"##INFO=<ID=%s,Number=0,Type=Flag,Description=\"Sites %slisted in %s\">",
args->mark_sites,args->mark_sites_logic==MARK_LISTED?"":"not ",args->mark_sites);
}
args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p);
bcf_hdr_write(args->out_fh, args->hdr_out);
}
}
if ( args->hdr_out ) bcf_hdr_destroy(args->hdr_out);
if (args->vcmp) vcmp_destroy(args->vcmp);
for (i=0; i<args->ncols; i++)
- free(args->cols[i].hdr_key);
+ {
+ free(args->cols[i].hdr_key_src);
+ free(args->cols[i].hdr_key_dst);
+ }
free(args->cols);
for (i=0; i<args->malines; i++)
{
// there is a matching line
for (j=0; j<args->ncols; j++)
if ( args->cols[j].setter(args,line,&args->cols[j],&args->alines[i]) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
}
bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,i<args->nalines?0:1);
}
}
- else if ( args->files->nreaders == 2 && bcf_sr_has_line(args->files,1) )
+ else if ( args->files->nreaders == 2 )
{
- bcf1_t *aline = bcf_sr_get_line(args->files,1);
- for (j=0; j<args->ncols; j++)
- if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
- error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key,bcf_seqname(args->hdr,line),line->pos+1);
+ if ( bcf_sr_has_line(args->files,1) )
+ {
+ bcf1_t *aline = bcf_sr_get_line(args->files,1);
+ for (j=0; j<args->ncols; j++)
+ if ( args->cols[j].setter(args,line,&args->cols[j],aline) )
+ error("fixme: Could not set %s at %s:%d\n", args->cols[j].hdr_key_src,bcf_seqname(args->hdr,line),line->pos+1);
+
+ if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL,args->mark_sites_logic==MARK_LISTED ? 1 : 0);
+ }
+ else if ( args->mark_sites )
+ bcf_update_info_flag(args->hdr_out,line,args->mark_sites,NULL, args->mark_sites_logic==MARK_UNLISTED ? 1 : 0);
}
if ( args->set_ids )
{
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Options:\n");
fprintf(pysam_stderr, " -a, --annotations <file> VCF file or tabix-indexed file with annotations: CHR\\tPOS[\\tVALUE]+\n");
+ fprintf(pysam_stderr, " --collapse <string> matching records by <snps|indels|both|all|some|none>, see man page for details [some]\n");
fprintf(pysam_stderr, " -c, --columns <list> list of columns in the annotation file, e.g. CHROM,POS,REF,ALT,-,INFO/TAG. See man page for details\n");
fprintf(pysam_stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(pysam_stderr, " -h, --header-lines <file> lines which should be appended to the VCF header\n");
args->record_cmd_line = 1;
args->ref_idx = args->alt_idx = args->chr_idx = args->from_idx = args->to_idx = -1;
args->set_ids_replace = 1;
- int regions_is_file = 0;
+ int regions_is_file = 0, collapse = 0;
static struct option loptions[] =
{
{"output-type",required_argument,NULL,'O'},
{"threads",required_argument,NULL,9},
{"annotations",required_argument,NULL,'a'},
+ {"collapse",required_argument,NULL,2},
{"include",required_argument,NULL,'i'},
{"exclude",required_argument,NULL,'e'},
{"regions",required_argument,NULL,'r'},
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
case 'h': args->header_fname = optarg; break;
case 1 : args->rename_chrs = optarg; break;
+ case 2 :
+ if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
+ else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
+ else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
+ else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
+ else if ( !strcmp(optarg,"none") ) collapse = COLLAPSE_NONE;
+ else error("The --collapse string \"%s\" not recognised.\n", optarg);
+ break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 8 : args->record_cmd_line = 0; break;
case '?': usage(args); break;
{
args->tgts_is_vcf = 1;
args->files->require_index = 1;
- args->files->collapse |= COLLAPSE_SOME;
+ args->files->collapse = collapse ? collapse : COLLAPSE_SOME;
}
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
#include <stdarg.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
"* * * F 2\n"
},
{ .alias = "GRCh38",
- .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)",
+ .about = "Human Genome reference assembly GRCh38 / hg38",
.ploidy =
"X 1 9999 M 1\n"
"X 2781480 155701381 M 1\n"
args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
- int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
+ int dflt_sex_id = ploidy_nsex(args->ploidy) - 1;
for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+ if ( old2new[ismpl] != -1 ) { fprintf(stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
ss = se+1;
while ( *ss && isspace(*ss) ) ss++;
{
args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
- for (i=0; i<args->nsamples; i++) args->sample2sex[i] = 0;
+ for (i=0; i<args->nsamples; i++) args->sample2sex[i] = args->nsex - 1;
}
}
if ( args->nsamples )
{
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
- for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
- for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
+ for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsamples; i++)
+ if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
}
- if ( args->gvcf )
+ if ( args->gvcf )
+ {
+ int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP");
+ if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n");
gvcf_update_header(args->gvcf, args->aux.hdr);
+ }
if ( args->samples_map )
{
else
args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
}
-
int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
}
if ( !pld->alias )
{
- fprintf(stderr,"Predefined ploidies:\n");
+ fprintf(stderr,"\nPRE-DEFINED PLOIDY FILES\n\n");
+ fprintf(stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(stderr," * Coordinates are 1-based inclusive.\n");
+ fprintf(stderr," * A '*' means any value not otherwise defined.\n\n");
pld = ploidy_predefs;
while ( pld->alias )
{
fprintf(stderr, "Input/output options:\n");
fprintf(stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
fprintf(stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
+ fprintf(stderr, " -F, --prior-freqs <AN,AC> use prior allele frequencies\n");
fprintf(stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
fprintf(stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
fprintf(stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
fprintf(stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
fprintf(stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
+ fprintf(stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
// todo (and more)
// fprintf(stderr, "\nContrast calling and association test options:\n");
{
{"help",no_argument,NULL,'h'},
{"format-fields",required_argument,NULL,'f'},
+ {"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
case 'i': args.flag |= CF_INS_MISSED; break;
case 'v': args.aux.flag |= CALL_VARONLY; break;
+ case 'F':
+ args.aux.prior_AN = optarg;
+ args.aux.prior_AC = strchr(optarg,',');
+ if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg);
+ *args.aux.prior_AC = 0;
+ args.aux.prior_AC++;
+ break;
case 'g':
args.gvcf = gvcf_init(optarg);
if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
if ( !ploidy_fname && !ploidy )
{
- fprintf(stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
- args.ploidy = ploidy_init_string("",2);
+ if ( !args.samples_is_file ) fprintf(stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n");
+ args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2);
}
if ( !args.ploidy ) error("Could not initialize ploidy\n");
else
ret = ccall(&args.aux, bcf_rec);
if ( ret==-1 ) error("Something is wrong\n");
+ else if ( ret==-2 ) continue; // skip the site
// Normal output
if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant
#include <stdarg.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
"* * * F 2\n"
},
{ .alias = "GRCh38",
- .about = "Human Genome reference assembly GRCh38 / hg38, plain chromosome naming (1,2,3,..)",
+ .about = "Human Genome reference assembly GRCh38 / hg38",
.ploidy =
"X 1 9999 M 1\n"
"X 2781480 155701381 M 1\n"
args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
args->sample2sex = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
- int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
+ int dflt_sex_id = ploidy_nsex(args->ploidy) - 1;
for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;
int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
if ( ismpl < 0 ) { fprintf(pysam_stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }
+ if ( old2new[ismpl] != -1 ) { fprintf(pysam_stderr,"Warning: The sample is listed multiple times: %s\n",ss); continue; }
ss = se+1;
while ( *ss && isspace(*ss) ) ss++;
{
args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
args->sample2sex = (int*) malloc(sizeof(int)*args->nsamples);
- for (i=0; i<args->nsamples; i++) args->sample2sex[i] = 0;
+ for (i=0; i<args->nsamples; i++) args->sample2sex[i] = args->nsex - 1;
}
}
if ( args->nsamples )
{
args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
- for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
- for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
+ for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = ploidy_max(args->ploidy);
+ for (i=0; i<args->nsamples; i++)
+ if ( args->sample2sex[i] >= args->nsex ) args->sample2sex[i] = args->nsex - 1;
}
- if ( args->gvcf )
+ if ( args->gvcf )
+ {
+ int id = bcf_hdr_id2int(args->aux.hdr,BCF_DT_ID,"DP");
+ if ( id<0 || !bcf_hdr_idinfo_exists(args->aux.hdr,BCF_HL_FMT,id) ) error("--gvcf output mode requires FORMAT/DP tag, which is not present in the input header\n");
gvcf_update_header(args->gvcf, args->aux.hdr);
+ }
if ( args->samples_map )
{
else
args->aux.ploidy[i] = args->sex2ploidy[args->sample2sex[i]];
}
-
int *tmp = args->sex2ploidy; args->sex2ploidy = args->sex2ploidy_prev; args->sex2ploidy_prev = tmp;
}
if ( !pld->alias )
{
- fprintf(pysam_stderr,"Predefined ploidies:\n");
+ fprintf(pysam_stderr,"\nPRE-DEFINED PLOIDY FILES\n\n");
+ fprintf(pysam_stderr," * Columns are: CHROM,FROM,TO,SEX,PLOIDY\n");
+ fprintf(pysam_stderr," * Coordinates are 1-based inclusive.\n");
+ fprintf(pysam_stderr," * A '*' means any value not otherwise defined.\n\n");
pld = ploidy_predefs;
while ( pld->alias )
{
fprintf(pysam_stderr, "Input/output options:\n");
fprintf(pysam_stderr, " -A, --keep-alts keep all possible alternate alleles at variant sites\n");
fprintf(pysam_stderr, " -f, --format-fields <list> output format fields: GQ,GP (lowercase allowed) []\n");
+ fprintf(pysam_stderr, " -F, --prior-freqs <AN,AC> use prior allele frequencies\n");
fprintf(pysam_stderr, " -g, --gvcf <int>,[...] group non-variant sites into gVCF blocks by minimum per-sample DP\n");
fprintf(pysam_stderr, " -i, --insert-missed output also sites missed by mpileup but present in -T\n");
fprintf(pysam_stderr, " -M, --keep-masked-ref keep sites with masked reference allele (REF=N)\n");
fprintf(pysam_stderr, " -m, --multiallelic-caller alternative model for multiallelic and rare-variant calling (conflicts with -c)\n");
fprintf(pysam_stderr, " -n, --novel-rate <float>,[...] likelihood of novel mutation for constrained trio calling, see man page for details [1e-8,1e-9,1e-9]\n");
fprintf(pysam_stderr, " -p, --pval-threshold <float> variant if P(ref|D)<FLOAT with -c [0.5]\n");
- fprintf(pysam_stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity) [1.1e-3]\n");
+ fprintf(pysam_stderr, " -P, --prior <float> mutation rate (use bigger for greater sensitivity), use with -m [1.1e-3]\n");
// todo (and more)
// fprintf(pysam_stderr, "\nContrast calling and association test options:\n");
{
{"help",no_argument,NULL,'h'},
{"format-fields",required_argument,NULL,'f'},
+ {"prior-freqs",required_argument,NULL,'F'},
{"gvcf",required_argument,NULL,'g'},
{"output",required_argument,NULL,'o'},
{"output-type",required_argument,NULL,'O'},
};
char *tmp = NULL;
- while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XY", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:ig:XYF:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
case 'i': args.flag |= CF_INS_MISSED; break;
case 'v': args.aux.flag |= CALL_VARONLY; break;
+ case 'F':
+ args.aux.prior_AN = optarg;
+ args.aux.prior_AC = strchr(optarg,',');
+ if ( !args.aux.prior_AC ) error("Expected two tags with -F (e.g. AN,AC), got \"%s\"\n",optarg);
+ *args.aux.prior_AC = 0;
+ args.aux.prior_AC++;
+ break;
case 'g':
args.gvcf = gvcf_init(optarg);
if ( !args.gvcf ) error("Could not parse: --gvcf %s\n", optarg);
if ( !ploidy_fname && !ploidy )
{
- fprintf(pysam_stderr,"Note: Neither --ploidy nor --ploidy-file given, assuming all sites are diploid\n");
- args.ploidy = ploidy_init_string("",2);
+ if ( !args.samples_is_file ) fprintf(pysam_stderr,"Note: none of --samples-file, --ploidy or --ploidy-file given, assuming all sites are diploid\n");
+ args.ploidy = ploidy_init_string("* * * 0 0\n* * * 1 1\n* * * 2 2\n",2);
}
if ( !args.ploidy ) error("Could not initialize ploidy\n");
else
ret = ccall(&args.aux, bcf_rec);
if ( ret==-1 ) error("Something is wrong\n");
+ else if ( ret==-2 ) continue; // skip the site
// Normal output
if ( (args.aux.flag & CALL_VARONLY) && ret==0 && !args.gvcf ) continue; // not a variant
hmm_init_states(args->hmm, args->iprobs);
args->summary_fh = stdout;
- if ( args->output_dir )
+ init_sample_files(&args->query_sample, args->output_dir);
+ if ( args->control_sample.name )
{
- init_sample_files(&args->query_sample, args->output_dir);
- if ( args->control_sample.name )
- {
- init_sample_files(&args->control_sample, args->output_dir);
- args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
- }
- else
- args->summary_fh = NULL; // one sample only, no two-file summary
+ init_sample_files(&args->control_sample, args->output_dir);
+ args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
}
+ else
+ args->summary_fh = NULL; // one sample only, no two-file summary
+
int i;
FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
"# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n",
args->query_sample.name
);
+ if ( args->optimize_frac )
+ {
+ fprintf(args->query_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ fprintf(args->summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t"
+ "[5]Cell fraction:%s\t[6]Cell fraction:%s\t[7]BAF deviation:%s\t[8]BAF deviation:%s\n",
+ args->query_sample.name,args->control_sample.name,
+ args->query_sample.name,args->control_sample.name
+ );
+ }
+ }
}
char *msprintf(const char *fmt, ...);
free(args->sites);
free(args->eprob);
free(args->tprob);
+ free(args->iprobs);
free(args->summary_fname);
free(args->nonref_afs);
free(args->query_sample.baf);
if ( args->control_sample.name )
fprintf(stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
fprintf(stderr,"\n");
+
+ fprintf(args->query_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac,sqrt(args->query_sample.baf_dev2));
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->control_sample.cell_frac,sqrt(args->control_sample.baf_dev2));
+ fprintf(args->summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%.2f\t%f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac, args->control_sample.cell_frac,
+ sqrt(args->query_sample.baf_dev2), sqrt(args->control_sample.baf_dev2));
+ }
}
set_emission_probs(args);
else fname = argv[optind];
if ( !fname ) usage(args);
- if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n");
+ if ( !args->output_dir ) error("Expected -o option\n");
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
hmm_init_states(args->hmm, args->iprobs);
args->summary_fh = pysam_stdout;
- if ( args->output_dir )
+ init_sample_files(&args->query_sample, args->output_dir);
+ if ( args->control_sample.name )
{
- init_sample_files(&args->query_sample, args->output_dir);
- if ( args->control_sample.name )
- {
- init_sample_files(&args->control_sample, args->output_dir);
- args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
- }
- else
- args->summary_fh = NULL; // one sample only, no two-file summary
+ init_sample_files(&args->control_sample, args->output_dir);
+ args->summary_fh = open_file(&args->summary_fname,"w","%s/summary.tab",args->output_dir);
}
+ else
+ args->summary_fh = NULL; // one sample only, no two-file summary
+
int i;
FILE *fh = args->summary_fh ? args->summary_fh : args->query_sample.summary_fh;
"# RG, Regions\t[2]Chromosome\t[3]Start\t[4]End\t[5]Copy number:%s\t[6]Quality\t[7]nSites\t[8]nHETs\n",
args->query_sample.name
);
+ if ( args->optimize_frac )
+ {
+ fprintf(args->query_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t[5]Cell fraction\t[6]BAF deviation\n");
+ fprintf(args->summary_fh, "# CF, cell fraction estimate\t[2]Chromosome\t[3]Start\t[4]End\t"
+ "[5]Cell fraction:%s\t[6]Cell fraction:%s\t[7]BAF deviation:%s\t[8]BAF deviation:%s\n",
+ args->query_sample.name,args->control_sample.name,
+ args->query_sample.name,args->control_sample.name
+ );
+ }
+ }
}
char *msprintf(const char *fmt, ...);
free(args->sites);
free(args->eprob);
free(args->tprob);
+ free(args->iprobs);
free(args->summary_fname);
free(args->nonref_afs);
free(args->query_sample.baf);
if ( args->control_sample.name )
fprintf(pysam_stderr,"\t.. %f %f", args->control_sample.cell_frac,args->control_sample.baf_dev2);
fprintf(pysam_stderr,"\n");
+
+ fprintf(args->query_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac,sqrt(args->query_sample.baf_dev2));
+ if ( args->control_sample.name )
+ {
+ fprintf(args->control_sample.summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->control_sample.cell_frac,sqrt(args->control_sample.baf_dev2));
+ fprintf(args->summary_fh,"CF\t%s\t%d\t%d\t%.2f\t%.2f\t%f\t%f\n",
+ bcf_hdr_id2name(args->hdr,args->prev_rid),args->sites[0]+1,args->sites[args->nsites-1]+1,
+ args->query_sample.cell_frac, args->control_sample.cell_frac,
+ sqrt(args->query_sample.baf_dev2), sqrt(args->control_sample.baf_dev2));
+ }
}
set_emission_probs(args);
else fname = argv[optind];
if ( !fname ) usage(args);
- if ( args->plot_th<=100 && !args->output_dir ) error("Expected -o option with -p\n");
+ if ( !args->output_dir ) error("Expected -o option\n");
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
}
}
+int print_vcf_gz_header(BGZF *fp, BGZF *bgzf_out, int print_header, kstring_t *tmp)
+{
+ char *buffer = (char*) fp->uncompressed_block;
+
+ // Read the header and find the position of the data block
+ if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);
+
+ int nskip = 1; // end of the header in the current uncompressed block
+ while (1)
+ {
+ if ( buffer[nskip]=='\n' )
+ {
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,nskip,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ // The header has finished
+ if ( buffer[nskip]!='#' )
+ {
+ kputsn(buffer,nskip,tmp);
+ break;
+ }
+ }
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,fp->block_length,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ }
+ if ( print_header )
+ {
+ if ( bgzf_write(bgzf_out,tmp->s,tmp->l) != tmp->l ) error("Failed to write %d bytes\n", tmp->l);
+ tmp->l = 0;
+ }
+ return nskip;
+}
+
+static inline int unpackInt16(const uint8_t *buffer)
+{
+ return buffer[0] | buffer[1] << 8;
+}
+static int check_header(const uint8_t *header)
+{
+ if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2;
+ return ((header[3] & 4) != 0
+ && unpackInt16((uint8_t*)&header[10]) == 6
+ && header[12] == 'B' && header[13] == 'C'
+ && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1;
+}
static void naive_concat(args_t *args)
{
// only compressed BCF atm
BGZF *bgzf_out = bgzf_open(args->output_fname,"w");;
- const size_t page_size = 32768;
- char *buf = (char*) malloc(page_size);
+ const size_t page_size = BGZF_MAX_BLOCK_SIZE;
+ uint8_t *buf = (uint8_t*) malloc(page_size);
kstring_t tmp = {0,0,0};
- int i;
+ int i, file_types = 0;
for (i=0; i<args->nfnames; i++)
{
htsFile *hts_fp = hts_open(args->fnames[i],"r");
if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
htsFormat type = *hts_get_format(hts_fp);
- if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
- if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+ if ( type.compression!=bgzf )
+ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
+ file_types |= type.format==vcf ? 1 : 2;
+ if ( file_types==3 )
+ error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n");
BGZF *fp = hts_get_bgzfp(hts_fp);
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));
- uint8_t magic[5];
- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
+ int nskip;
+ if ( type.format==bcf )
+ {
+ uint8_t magic[5];
+ if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- hts_expand(char,tmp.l,tmp.m,tmp.s);
- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ hts_expand(char,tmp.l,tmp.m,tmp.s);
+ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- // write only the first header
- if ( i==0 )
+ // write only the first header
+ if ( i==0 )
+ {
+ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
+ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
+ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ }
+ nskip = fp->block_offset;
+ }
+ else
{
- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp);
+ if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]);
}
// Output all non-header data that were read together with the header block
- int nskip = fp->block_offset;
if ( fp->block_length - nskip > 0 )
{
- if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
// Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
- ssize_t nread, ncached = 0, nwr;
- const int neof = 28;
- char cached[neof];
+ // The final bgzf eof block will be added by bgzf_close.
+ ssize_t nread, nblock, nwr;
+ const int nheader = 18, neof = 28;
+ const uint8_t *eof = (uint8_t*) "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
while (1)
{
- nread = bgzf_raw_read(fp, buf, page_size);
-
- // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
- if ( nread<=0 ) break;
- if ( nread<=neof ) // last block
- {
- if ( ncached )
- {
- // flush the part of the cache that won't be needed
- nwr = bgzf_raw_write(bgzf_out, cached, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
-
- // make space in the cache so that we can append to the end
- if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
- }
-
- // fill the cache and check for eof outside this loop
- memcpy(cached+neof-nread,buf,nread);
- break;
- }
-
- // not the last block, flush the cache if full
- if ( ncached )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, ncached);
- if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached);
- ncached = 0;
- }
-
- // fill the cache
- nread -= neof;
- memcpy(cached,buf+nread,neof);
- ncached = neof;
-
+ nread = bgzf_raw_read(fp, buf, nheader);
+ if ( !nread ) break;
+ if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]);
+ nblock = unpackInt16(buf+16) + 1;
+ assert( nblock <= page_size && nblock >= nheader );
+ nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader);
+ if ( nread!=nblock ) error("Could not read %d bytes: %s\n",nblock,args->fnames[i]);
+ if ( nread==neof && !memcmp(buf,eof,neof) ) continue;
nwr = bgzf_raw_write(bgzf_out, buf, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
- }
- if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, neof);
- if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof);
+ if ( nwr != nread ) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
}
if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
}
fprintf(stderr, " -D, --remove-duplicates Alias for -d none\n");
fprintf(stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
- fprintf(stderr, " --no-version do not append version and command line to the header\n");
- fprintf(stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
+ fprintf(stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n");
fprintf(stderr, " -o, --output <file> Write output to a file [standard output]\n");
fprintf(stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
}
}
+int print_vcf_gz_header(BGZF *fp, BGZF *bgzf_out, int print_header, kstring_t *tmp)
+{
+ char *buffer = (char*) fp->uncompressed_block;
+
+ // Read the header and find the position of the data block
+ if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);
+
+ int nskip = 1; // end of the header in the current uncompressed block
+ while (1)
+ {
+ if ( buffer[nskip]=='\n' )
+ {
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,nskip,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ // The header has finished
+ if ( buffer[nskip]!='#' )
+ {
+ kputsn(buffer,nskip,tmp);
+ break;
+ }
+ }
+ nskip++;
+ if ( nskip>=fp->block_length )
+ {
+ kputsn(buffer,fp->block_length,tmp);
+ if ( bgzf_read_block(fp) != 0 ) return -1;
+ if ( !fp->block_length ) break;
+ nskip = 0;
+ }
+ }
+ if ( print_header )
+ {
+ if ( bgzf_write(bgzf_out,tmp->s,tmp->l) != tmp->l ) error("Failed to write %d bytes\n", tmp->l);
+ tmp->l = 0;
+ }
+ return nskip;
+}
+
+static inline int unpackInt16(const uint8_t *buffer)
+{
+ return buffer[0] | buffer[1] << 8;
+}
+static int check_header(const uint8_t *header)
+{
+ if ( header[0] != 31 || header[1] != 139 || header[2] != 8 ) return -2;
+ return ((header[3] & 4) != 0
+ && unpackInt16((uint8_t*)&header[10]) == 6
+ && header[12] == 'B' && header[13] == 'C'
+ && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1;
+}
static void naive_concat(args_t *args)
{
// only compressed BCF atm
BGZF *bgzf_out = bgzf_open(args->output_fname,"w");;
- const size_t page_size = 32768;
- char *buf = (char*) malloc(page_size);
+ const size_t page_size = BGZF_MAX_BLOCK_SIZE;
+ uint8_t *buf = (uint8_t*) malloc(page_size);
kstring_t tmp = {0,0,0};
- int i;
+ int i, file_types = 0;
for (i=0; i<args->nfnames; i++)
{
htsFile *hts_fp = hts_open(args->fnames[i],"r");
if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
htsFormat type = *hts_get_format(hts_fp);
- if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
- if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
+ if ( type.compression!=bgzf )
+ error("The --naive option works only for compressed BCFs or VCFs, sorry :-/\n");
+ file_types |= type.format==vcf ? 1 : 2;
+ if ( file_types==3 )
+ error("The --naive option works only for compressed files of the same type, all BCFs or all VCFs :-/\n");
BGZF *fp = hts_get_bgzfp(hts_fp);
if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));
- uint8_t magic[5];
- if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
+ int nskip;
+ if ( type.format==bcf )
+ {
+ uint8_t magic[5];
+ if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);
- if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- hts_expand(char,tmp.l,tmp.m,tmp.s);
- if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
+ hts_expand(char,tmp.l,tmp.m,tmp.s);
+ if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
- // write only the first header
- if ( i==0 )
+ // write only the first header
+ if ( i==0 )
+ {
+ if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
+ if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
+ if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ }
+ nskip = fp->block_offset;
+ }
+ else
{
- if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
- if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
- if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname);
+ nskip = print_vcf_gz_header(fp, bgzf_out, i==0?1:0, &tmp);
+ if ( nskip==-1 ) error("Error reading %s\n", args->fnames[i]);
}
// Output all non-header data that were read together with the header block
- int nskip = fp->block_offset;
if ( fp->block_length - nskip > 0 )
{
- if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
+ if ( bgzf_write(bgzf_out, (char *)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
}
if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);
// Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
- ssize_t nread, ncached = 0, nwr;
- const int neof = 28;
- char cached[neof];
+ // The final bgzf eof block will be added by bgzf_close.
+ ssize_t nread, nblock, nwr;
+ const int nheader = 18, neof = 28;
+ const uint8_t *eof = (uint8_t*) "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0";
while (1)
{
- nread = bgzf_raw_read(fp, buf, page_size);
-
- // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
- if ( nread<=0 ) break;
- if ( nread<=neof ) // last block
- {
- if ( ncached )
- {
- // flush the part of the cache that won't be needed
- nwr = bgzf_raw_write(bgzf_out, cached, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
-
- // make space in the cache so that we can append to the end
- if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
- }
-
- // fill the cache and check for eof outside this loop
- memcpy(cached+neof-nread,buf,nread);
- break;
- }
-
- // not the last block, flush the cache if full
- if ( ncached )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, ncached);
- if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached);
- ncached = 0;
- }
-
- // fill the cache
- nread -= neof;
- memcpy(cached,buf+nread,neof);
- ncached = neof;
-
+ nread = bgzf_raw_read(fp, buf, nheader);
+ if ( !nread ) break;
+ if ( nread != nheader || check_header(buf)!=0 ) error("Could not parse the header of a bgzf block: %s\n",args->fnames[i]);
+ nblock = unpackInt16(buf+16) + 1;
+ assert( nblock <= page_size && nblock >= nheader );
+ nread += bgzf_raw_read(fp, buf+nheader, nblock - nheader);
+ if ( nread!=nblock ) error("Could not read %d bytes: %s\n",nblock,args->fnames[i]);
+ if ( nread==neof && !memcmp(buf,eof,neof) ) continue;
nwr = bgzf_raw_write(bgzf_out, buf, nread);
- if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
- }
- if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
- {
- nwr = bgzf_raw_write(bgzf_out, cached, neof);
- if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof);
+ if ( nwr != nread ) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread);
}
if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
}
fprintf(pysam_stderr, " -D, --remove-duplicates Alias for -d none\n");
fprintf(pysam_stderr, " -f, --file-list <file> Read the list of files from a file.\n");
fprintf(pysam_stderr, " -l, --ligate Ligate phased VCFs by matching phase at overlapping haplotypes\n");
- fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
- fprintf(pysam_stderr, " -n, --naive Concatenate BCF files without recompression (dangerous, use with caution)\n");
+ fprintf(pysam_stderr, " --no-version Do not append version and command line to the header\n");
+ fprintf(pysam_stderr, " -n, --naive Concatenate files without recompression (dangerous, use with caution)\n");
fprintf(pysam_stderr, " -o, --output <file> Write output to a file [standard output]\n");
fprintf(pysam_stderr, " -O, --output-type <b|u|z|v> b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]\n");
fprintf(pysam_stderr, " -q, --min-PQ <int> Break phase set if phasing quality is lower than <int> [30]\n");
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
- char *outfname, *infname, *ref_fname;
+ char *outfname, *infname, *ref_fname, *sex_fname;
int argc, n_threads, record_cmd_line;
};
static void open_vcf(args_t *args, const char *format_str)
{
args->files = bcf_sr_init();
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 )
+ error("Could not initialize --threads %d\n", args->n_threads);
+
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
}
if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
free(samples);
-
- if ( args->filter_str )
- args->filter = filter_init(args->header, args->filter_str);
}
static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
int i, nrows, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nrows);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
nsamples = nrows - 1;
// sample_fname should contain a header line, so need to ignore first row
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
fprintf(stderr,"Number of processed rows: \t%d\n", args->n.total);
}
+char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
+{
+ int i, nlines;
+ char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1);
+ char **lines = hts_readlist(sex_fname, 1, &nlines);
+ if ( !lines ) error("Could not read %s\n", sex_fname);
+ for (i=0; i<nlines; i++)
+ {
+ char *se = lines[i]; while ( *se && !isspace(*se) ) se++;
+ char tmp = *se;
+ *se = 0;
+ int id = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, lines[i]);
+ *se = tmp;
+ if ( id<0 ) continue;
+ while ( *se && isspace(*se) ) se++;
+ if ( *se=='M' ) sample2sex[id] = '1';
+ else if ( *se=='F' ) sample2sex[id] = '2';
+ else error("Could not parse %s: %s\n", sex_fname,lines[i]);
+ }
+ for (i=0; i<nlines; i++) free(lines[i]);
+ free(lines);
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
+ return sample2sex;
+}
+
static void vcf_to_gensample(args_t *args)
{
kstring_t str = {0,0,0};
char *gen_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!gen_fname) {
if ( str.m ) free(str.s);
char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]);
+ ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2');
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname && !legend_fname) {
if ( str.m ) free(str.s);
// open haps and legend outputs
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
if (legend_fname) {
str.l = 0;
char *hap_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname) {
if ( str.m ) free(str.s);
// open haps output
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
while ( bcf_sr_next_line(args->files) )
if ( !pass ) continue;
}
- if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") )
+ if (!bcf_has_filter(hdr,line,"PASS"))
+ {
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+
+ // check if alleles compatible with being a gVCF record
+ int i, gallele = -1;
+ if (line->n_allele==1)
+ gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
+ else
+ {
+ if ( line->d.allele[1][0]!='<' ) continue;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
+ if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
+ if ( strcmp(line->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; } // GATK gVCF
+ }
+ }
+
+ // no gVCF compatible alleles
+ if (gallele<0)
{
- // Assuming that only ALT=. sites can be blocks and skipping sites which don't PASS
bcf_write(out_fh,hdr,line);
continue;
}
int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
if ( nend!=1 )
{
- // No END lineord
+ // No INFO/END => not gVCF record
bcf_write(out_fh,hdr,line);
continue;
}
line->pos = pos;
char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1);
- // we have already checked above that there is only one allele,
- // so fine to just update alleles with the ref allele from the fasta
- bcf_update_alleles_str(hdr, line, &ref[0]);
+ strncpy(line->d.allele[0],ref,len);
bcf_write(out_fh,hdr,line);
+ free(ref);
}
}
free(itmp);
fprintf(stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
fprintf(stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
fprintf(stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "gVCF conversion:\n");
fprintf(stderr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(stderr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "HAP/LEGEND/SAMPLE conversion:\n");
fprintf(stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(stderr, "\n");
fprintf(stderr, "TSV conversion:\n");
{"targets-file",required_argument,NULL,'T'},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
+ {"sex",required_argument,NULL,11},
{"gensample",required_argument,NULL,'g'},
{"gensample2vcf",required_argument,NULL,'G'},
{"tag",required_argument,NULL,1},
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 10 : args->record_cmd_line = 0; break;
+ case 11 : args->sex_fname = optarg; break;
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
}
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
int rev_als, output_vcf_ids, hap2dip, output_chrom_first_col;
int nsamples, *samples, sample_is_file, targets_is_file, regions_is_file, output_type;
char **argv, *sample_list, *targets_list, *regions_list, *tag, *columns;
- char *outfname, *infname, *ref_fname;
+ char *outfname, *infname, *ref_fname, *sex_fname;
int argc, n_threads, record_cmd_line;
};
static void open_vcf(args_t *args, const char *format_str)
{
args->files = bcf_sr_init();
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)!=0 )
+ error("Could not initialize --threads %d\n", args->n_threads);
+
if ( args->regions_list )
{
if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
}
if ( format_str ) args->convert = convert_init(args->header, samples, nsamples, format_str);
free(samples);
-
- if ( args->filter_str )
- args->filter = filter_init(args->header, args->filter_str);
}
static int tsv_setter_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
int i, nrows, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nrows);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
nsamples = nrows - 1;
// sample_fname should contain a header line, so need to ignore first row
int i, nsamples;
char **samples = hts_readlist(sample_fname, 1, &nsamples);
+ if ( !samples ) error("Could not read %s\n", sample_fname);
for (i=2; i<nsamples; i++)
{
se = samples[i]; while ( *se && !isspace(*se) ) se++;
fprintf(pysam_stderr,"Number of processed rows: \t%d\n", args->n.total);
}
+char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
+{
+ int i, nlines;
+ char *sample2sex = (char*) calloc(bcf_hdr_nsamples(hdr),1);
+ char **lines = hts_readlist(sex_fname, 1, &nlines);
+ if ( !lines ) error("Could not read %s\n", sex_fname);
+ for (i=0; i<nlines; i++)
+ {
+ char *se = lines[i]; while ( *se && !isspace(*se) ) se++;
+ char tmp = *se;
+ *se = 0;
+ int id = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, lines[i]);
+ *se = tmp;
+ if ( id<0 ) continue;
+ while ( *se && isspace(*se) ) se++;
+ if ( *se=='M' ) sample2sex[id] = '1';
+ else if ( *se=='F' ) sample2sex[id] = '2';
+ else error("Could not parse %s: %s\n", sex_fname,lines[i]);
+ }
+ for (i=0; i<nlines; i++) free(lines[i]);
+ free(lines);
+ for (i=0; i<bcf_hdr_nsamples(hdr); i++)
+ if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
+ return sample2sex;
+}
+
static void vcf_to_gensample(args_t *args)
{
kstring_t str = {0,0,0};
char *gen_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!gen_fname) {
if ( str.m ) free(str.s);
char *hap_fname = NULL, *legend_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s %s 2\n", args->header->samples[i], args->header->samples[i], args->header->samples[i]);
+ ksprintf(&str, "%s %s %s %c\n", args->header->samples[i], args->header->samples[i], args->header->samples[i], sample2sex ? sample2sex[i] : '2');
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname && !legend_fname) {
if ( str.m ) free(str.s);
// open haps and legend outputs
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
BGZF *lout = legend_fname ? bgzf_open(legend_fname, legend_compressed ? "wg" : "wu") : NULL;
if (legend_fname) {
str.l = 0;
char *hap_fname = NULL, *sample_fname = NULL;
str.l = 0;
kputs(args->outfname,&str);
- int n_files, i;
+ int n_files = 0, i;
char **files = hts_readlist(str.s, 0, &n_files);
if ( n_files==1 )
{
if (sample_fname) fprintf(pysam_stderr, "Sample file: %s\n", sample_fname);
// write samples file
- if (sample_fname) {
+ if (sample_fname)
+ {
+ char *sample2sex = NULL;
+ if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
+
int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
- kputs("ID_1 ID_2 missing\n0 0 0\n", &str);
+ kputs(sample2sex ? "ID_1 ID_2 missing sex\n0 0 0 0\n" : "ID_1 ID_2 missing\n0 0 0\n", &str);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
for (i=0; i<bcf_hdr_nsamples(args->header); i++)
{
str.l = 0;
- ksprintf(&str, "%s %s 0\n", args->header->samples[i], args->header->samples[i]);
+ if ( sample2sex )
+ ksprintf(&str, "%s %s 0 %c\n", args->header->samples[i],args->header->samples[i],sample2sex[i]);
+ else
+ ksprintf(&str, "%s %s 0\n", args->header->samples[i],args->header->samples[i]);
ret = bgzf_write(sout, str.s, str.l);
if ( ret != str.l ) error("Error writing %s: %s\n", sample_fname, strerror(errno));
}
if ( bgzf_close(sout)!=0 ) error("Error closing %s: %s\n", sample_fname, strerror(errno));
free(sample_fname);
+ free(sample2sex);
}
if (!hap_fname) {
if ( str.m ) free(str.s);
// open haps output
BGZF *hout = hap_fname ? bgzf_open(hap_fname, hap_compressed ? "wg" : "wu") : NULL;
+ if ( hap_compressed && args->n_threads ) bgzf_thread_pool(hout, args->files->p->pool, args->files->p->qsize);
int no_alt = 0, non_biallelic = 0, filtered = 0, nok = 0;
while ( bcf_sr_next_line(args->files) )
if ( !pass ) continue;
}
- if ( line->n_allele!=1 || !bcf_has_filter(hdr,line,"PASS") )
+ if (!bcf_has_filter(hdr,line,"PASS"))
+ {
+ bcf_write(out_fh,hdr,line);
+ continue;
+ }
+
+ // check if alleles compatible with being a gVCF record
+ int i, gallele = -1;
+ if (line->n_allele==1)
+ gallele = 0; // illumina/bcftools-call gvcf (if INFO/END present)
+ else
+ {
+ if ( line->d.allele[1][0]!='<' ) continue;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( line->d.allele[i][1]=='*' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // mpileup/spec compliant gVCF
+ if ( line->d.allele[i][1]=='X' && line->d.allele[i][2]=='>' && line->d.allele[i][3]=='\0' ) { gallele = i; break; } // old mpileup gVCF
+ if ( strcmp(line->d.allele[i],"<NON_REF>")==0 ) { gallele = i; break; } // GATK gVCF
+ }
+ }
+
+ // no gVCF compatible alleles
+ if (gallele<0)
{
- // Assuming that only ALT=. sites can be blocks and skipping sites which don't PASS
bcf_write(out_fh,hdr,line);
continue;
}
int nend = bcf_get_info_int32(hdr,line,"END",&itmp,&nitmp);
if ( nend!=1 )
{
- // No END lineord
+ // No INFO/END => not gVCF record
bcf_write(out_fh,hdr,line);
continue;
}
line->pos = pos;
char *ref = faidx_fetch_seq(args->ref, (char*)bcf_hdr_id2name(hdr,line->rid), line->pos, line->pos, &len);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", bcf_hdr_id2name(hdr,line->rid), line->pos+1);
- // we have already checked above that there is only one allele,
- // so fine to just update alleles with the ref allele from the fasta
- bcf_update_alleles_str(hdr, line, &ref[0]);
+ strncpy(line->d.allele[0],ref,len);
bcf_write(out_fh,hdr,line);
+ free(ref);
}
}
free(itmp);
fprintf(pysam_stderr, " -g, --gensample <...> <prefix>|<gen-file>,<sample-file>\n");
fprintf(pysam_stderr, " --tag <string> tag to take values for .gen file: GT,PL,GL,GP [GT]\n");
fprintf(pysam_stderr, " --chrom output chromosome in first column instead of CHROM:POS_REF_ALT\n");
+ fprintf(pysam_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(pysam_stderr, " --vcf-ids output VCF IDs in second column instead of CHROM:POS_REF_ALT\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "gVCF conversion:\n");
fprintf(pysam_stderr, " --hapsample2vcf <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(pysam_stderr, " --hapsample <...> <prefix>|<haps-file>,<sample-file>\n");
fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysam_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "HAP/LEGEND/SAMPLE conversion:\n");
fprintf(pysam_stderr, " -H, --haplegendsample2vcf <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(pysam_stderr, " -h, --haplegendsample <...> <prefix>|<hap-file>,<legend-file>,<sample-file>\n");
fprintf(pysam_stderr, " --haploid2diploid convert haploid genotypes to diploid homozygotes\n");
+ fprintf(pysam_stderr, " --sex <file> output sex column in the sample-file, input format is: Sample\\t[MF]\n");
fprintf(pysam_stderr, " --vcf-ids output VCF IDs instead of CHROM:POS_REF_ALT\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "TSV conversion:\n");
{"targets-file",required_argument,NULL,'T'},
{"samples",required_argument,NULL,'s'},
{"samples-file",required_argument,NULL,'S'},
+ {"sex",required_argument,NULL,11},
{"gensample",required_argument,NULL,'g'},
{"gensample2vcf",required_argument,NULL,'G'},
{"tag",required_argument,NULL,1},
case 'h': args->convert_func = vcf_to_haplegendsample; args->outfname = optarg; break;
case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 10 : args->record_cmd_line = 0; break;
+ case 11 : args->sex_fname = optarg; break;
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
}
if ( tmp.s ) kputs(" and ", &tmp);
kputs("\"IndelGap\"", &tmp);
}
- fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
+ if ( strncmp(tmp.s+1,args->soft_filter,tmp.l-2) )
+ fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
free(tmp.s);
}
if ( tmp.s ) kputs(" and ", &tmp);
kputs("\"IndelGap\"", &tmp);
}
- fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
+ if ( strncmp(tmp.s+1,args->soft_filter,tmp.l-2) )
+ fprintf(pysam_stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
free(tmp.s);
}
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <inttypes.h>
#include "bcftools.h"
+#include "hclust.h"
typedef struct
{
bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
int ntmp_arr, npl_arr;
int32_t *tmp_arr, *pl_arr;
- double *lks, *sites;
+ double *lks, *sites, min_inter_err, max_intra_err;
int *cnts, *dps, hom_only, cross_check, all_sites;
char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
- int argc, no_PLs;
+ int argc, no_PLs, narr, nsmpl;
}
args_t;
free(fname);
}
+#if 0
static void plot_cross_check(args_t *args)
{
char *fname;
py_plot(fname);
free(fname);
}
+#endif
static void init_data(args_t *args)
{
args->sites = (double*) calloc(nsamples,sizeof(double));
args->dps = (int*) calloc(nsamples,sizeof(int));
}
- else
- {
- int nsamples = bcf_hdr_nsamples(args->sm_hdr);
- int narr = (nsamples-1)*nsamples/2;
- args->lks = (double*) calloc(narr,sizeof(double));
- args->cnts = (int*) calloc(narr,sizeof(int));
- args->dps = (int*) calloc(narr,sizeof(int));
- }
}
static void destroy_data(args_t *args)
}
}
-static inline int is_hom_most_likely(int nals, int *pls)
+// static inline int is_hom_most_likely(int nals, int *pls)
+// {
+// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
+// for (ia=1; ia<nals; ia++)
+// {
+// for (ib=0; ib<ia; ib++)
+// {
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+// idx++;
+// }
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
+// idx++;
+// }
+// return min_is_hom;
+// }
+
+int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
+{
+ int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+
+ if ( ngt<=0 ) return 1; // GT not present
+ if ( ngt!=args->nsmpl*2 ) return 2; // not diploid
+ ngt /= args->nsmpl;
+
+ int i,j, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
+ {
+ int32_t *a = args->tmp_arr + i*ngt;
+ if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
+ int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+
+ for (j=0; j<i; j++)
+ {
+ int32_t *b = args->tmp_arr + j*ngt;
+ if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
+ int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+
+ ntot[idx]++;
+ if ( agt!=bgt ) ndif[idx]++;
+ idx++;
+ }
+ }
+ return 0;
+}
+int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
- int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
- for (ia=1; ia<nals; ia++)
+ int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
+
+ if ( npl<=0 ) return 1; // PL not present
+ npl /= args->nsmpl;
+
+ int i,j,k, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
{
- for (ib=0; ib<ia; ib++)
+ int32_t *a = args->tmp_arr + i*npl;
+ int imin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( a[k]==bcf_int32_vector_end ) break;
+ if ( a[k]==bcf_int32_missing ) continue;
+ if ( imin==-1 || a[imin] > a[k] ) imin = k;
+ }
+ if ( imin<0 ) { idx+=i; continue; }
+
+ for (j=0; j<i; j++)
{
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+ int32_t *b = args->tmp_arr + j*npl;
+ int jmin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( b[k]==bcf_int32_vector_end ) break;
+ if ( b[k]==bcf_int32_missing ) continue;
+ if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+ }
+ if ( jmin<0 ) { idx++; continue; }
+
+ ntot[idx]++;
+ if ( imin!=jmin ) ndif[idx]++;
idx++;
}
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
- idx++;
}
- return min_is_hom;
+ return 0;
}
static void cross_check_gts(args_t *args)
{
- int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
- unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
- int fake_pls = args->no_PLs, ignore_dp = 0;
-
- int i,j,k,idx, pl_warned = 0, dp_warned = 0;
- int32_t *dp_arr = NULL;
- int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
+ // Initialize things: check which tags are defined in the header, sample names etc.
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
{
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs )
+ if ( !args->no_PLs ) {
fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- fake_pls = 1;
+ args->no_PLs = 99;
+ }
}
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
- print_header(args, fp);
- if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");
+ args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
+ args->narr = (args->nsmpl-1)*args->nsmpl/2;
+
+ uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
+ uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
while ( bcf_sr_next_line(args->files) )
{
- bcf1_t *line = args->files->readers[0].buffer[0];
- bcf_unpack(line, BCF_UN_FMT);
-
- int npl;
- if ( !fake_pls )
- {
- npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
- if ( npl<=0 ) { pl_warned++; continue; }
- npl /= nsamples;
- }
- else
- npl = fake_PLs(args, args->sm_hdr, line);
- int mdp = 0;
- if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++;
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
- if ( args->hom_only )
+ // use PLs unless no_PLs is set and GT exists
+ if ( args->no_PLs )
{
- for (i=0; i<nsamples; i++)
- is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
+ if ( process_GT(args,line,ntot,ndif)==0 ) continue;
}
-
- double sum = 0; int nsum = 0;
- idx = 0;
- for (i=0; i<nsamples; i++)
- {
- int *ipl = &args->pl_arr[i*npl];
- if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
- if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }
-
- for (j=0; j<i; j++)
- {
- int *jpl = &args->pl_arr[j*npl];
- if ( *jpl==-1 ) { idx++; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
- if ( args->hom_only && !is_hom[j] ) { idx++; continue; }
-
- int min_pl = INT_MAX;
- for (k=0; k<npl; k++)
- {
- if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
- if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
- if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
- }
- if ( k!=npl ) { idx++; continue; }
-
- if ( args->all_sites ) { sum += min_pl; nsum++; }
- args->lks[idx] += min_pl;
- args->cnts[idx]++;
-
- if ( mdp>0 )
- {
- args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
- dp[i] += dp_arr[i]; ndp[i]++;
- dp[j] += dp_arr[j]; ndp[j]++;
- }
- else
- {
- args->dps[idx]++;
- dp[i]++; ndp[i]++;
- dp[j]++; ndp[j]++;
- }
- idx++;
- }
- }
- if ( args->all_sites )
- fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
+ process_PL(args,line,ntot,ndif);
}
- if ( dp_arr ) free(dp_arr);
- if ( args->pl_arr ) free(args->pl_arr);
- if ( args->tmp_arr ) free(args->tmp_arr);
- if ( is_hom ) free(is_hom);
+
+ FILE *fp = stdout;
+ print_header(args, fp);
- if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
- if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
+ float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
- // Output samples sorted by average discordance
- double *score = (double*) calloc(nsamples,sizeof(double));
- args->sites = (double*) calloc(nsamples,sizeof(double));
- idx = 0;
- for (i=0; i<nsamples; i++)
+ // Output pairwise distances
+ fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
+ int i,j, idx = 0;
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- score[i] += args->lks[idx];
- score[j] += args->lks[idx];
- args->sites[i] += args->cnts[idx];
- args->sites[j] += args->cnts[idx];
+ float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
+ fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ PDIST(tmp,i,j) = err;
idx++;
}
}
- for (i=0; i<nsamples; i++)
- if ( args->sites[i] ) score[i] /= args->sites[i];
- double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
- for (i=0; i<nsamples; i++) p[i] = &score[i];
- qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
- // The average discordance gives the number of differing sites in % with -G1
- fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
- for (i=0; i<nsamples; i++)
+
+ // Cluster samples
+ int nlist;
+ float clust_max_err = args->max_intra_err;
+ hclust_t *clust = hclust_init(args->nsmpl,tmp);
+ cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
+ fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
+ for (i=0; i<nlist; i++)
{
- idx = p[i] - score;
- double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
- double nsites = args->sites[idx]/(nsamples-1);
- avg_score += score[idx];
- fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
+ fprintf(fp,"CLUSTER\t%f", list[i].dist);
+ for (j=0; j<list[i].nmemb; j++)
+ fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
+ fprintf(fp,"\n");
}
-
- // // Overall score: maximum absolute deviation from the average score
- // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
- // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set
- free(p);
- free(score);
- free(dp);
- free(ndp);
-
- // Pairwise discordances
+ hclust_destroy_list(list,nlist);
+ // Debugging output: the cluster graph and data used for deciding
+ char **dbg = hclust_explain(clust,&nlist);
+ for (i=0; i<nlist; i++)
+ fprintf(fp,"DBG\t%s\n", dbg[i]);
+ fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
+ fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
+ fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
+ hclust_destroy(clust);
+ free(tmp);
+
+
+ // Deprecated output for temporary backward compatibility
+ fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
idx = 0;
- for (i=0; i<nsamples; i++)
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
- args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
idx++;
}
}
- fclose(fp);
- if ( args->plot )
- plot_cross_check(args);
+
+ free(ndif);
+ free(ntot);
+ free(args->tmp_arr);
}
static char *init_prefix(char *prefix)
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -a, --all-sites output comparison for all sites\n");
+ fprintf(stderr, " -c, --cluster <min,max> min inter- and max intra-sample error [0.23,-0.3]\n");
fprintf(stderr, " -g, --genotypes <file> genotypes to compare against\n");
fprintf(stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
fprintf(stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
char *regions = NULL, *targets = NULL;
int regions_is_file = 0, targets_is_file = 0;
+ // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
+ // - min_inter: pairs with smaller err value will be considered identical
+ // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
+ // different. If negative, the cutoff may be heuristically lowered
+ args->min_inter_err = 0.23;
+ args->max_intra_err = -0.3;
+
static struct option loptions[] =
{
+ {"cluster",1,0,'c'},
{"GTs-only",1,0,'G'},
{"all-sites",0,0,'a'},
{"homs-only",0,0,'H'},
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'c':
+ args->min_inter_err = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -c %s\n", optarg);
+ args->max_intra_err = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -c %s\n", optarg);
+ }
+ break;
case 'G':
args->no_PLs = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <inttypes.h>
#include "bcftools.h"
+#include "hclust.h"
typedef struct
{
bcf_hdr_t *gt_hdr, *sm_hdr; // VCF with genotypes to compare against and the query VCF
int ntmp_arr, npl_arr;
int32_t *tmp_arr, *pl_arr;
- double *lks, *sites;
+ double *lks, *sites, min_inter_err, max_intra_err;
int *cnts, *dps, hom_only, cross_check, all_sites;
char *cwd, **argv, *gt_fname, *plot, *query_sample, *target_sample;
- int argc, no_PLs;
+ int argc, no_PLs, narr, nsmpl;
}
args_t;
free(fname);
}
+#if 0
static void plot_cross_check(args_t *args)
{
char *fname;
py_plot(fname);
free(fname);
}
+#endif
static void init_data(args_t *args)
{
args->sites = (double*) calloc(nsamples,sizeof(double));
args->dps = (int*) calloc(nsamples,sizeof(int));
}
- else
- {
- int nsamples = bcf_hdr_nsamples(args->sm_hdr);
- int narr = (nsamples-1)*nsamples/2;
- args->lks = (double*) calloc(narr,sizeof(double));
- args->cnts = (int*) calloc(narr,sizeof(int));
- args->dps = (int*) calloc(narr,sizeof(int));
- }
}
static void destroy_data(args_t *args)
}
}
-static inline int is_hom_most_likely(int nals, int *pls)
+// static inline int is_hom_most_likely(int nals, int *pls)
+// {
+// int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
+// for (ia=1; ia<nals; ia++)
+// {
+// for (ib=0; ib<ia; ib++)
+// {
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+// idx++;
+// }
+// if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
+// idx++;
+// }
+// return min_is_hom;
+// }
+
+int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
+{
+ int ngt = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
+
+ if ( ngt<=0 ) return 1; // GT not present
+ if ( ngt!=args->nsmpl*2 ) return 2; // not diploid
+ ngt /= args->nsmpl;
+
+ int i,j, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
+ {
+ int32_t *a = args->tmp_arr + i*ngt;
+ if ( bcf_gt_is_missing(a[0]) || bcf_gt_is_missing(a[1]) || a[1]==bcf_int32_vector_end ) { idx+=i; continue; }
+ int agt = 1<<bcf_gt_allele(a[0]) | 1<<bcf_gt_allele(a[1]);
+
+ for (j=0; j<i; j++)
+ {
+ int32_t *b = args->tmp_arr + j*ngt;
+ if ( bcf_gt_is_missing(b[0]) || bcf_gt_is_missing(b[1]) || b[1]==bcf_int32_vector_end ) { idx++; continue; }
+ int bgt = 1<<bcf_gt_allele(b[0]) | 1<<bcf_gt_allele(b[1]);
+
+ ntot[idx]++;
+ if ( agt!=bgt ) ndif[idx]++;
+ idx++;
+ }
+ }
+ return 0;
+}
+int process_PL(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
- int ia, ib, idx = 1, min_is_hom = 1, min_pl = pls[0];
- for (ia=1; ia<nals; ia++)
+ int npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->tmp_arr, &args->ntmp_arr);
+
+ if ( npl<=0 ) return 1; // PL not present
+ npl /= args->nsmpl;
+
+ int i,j,k, idx = 0;
+ for (i=1; i<args->nsmpl; i++)
{
- for (ib=0; ib<ia; ib++)
+ int32_t *a = args->tmp_arr + i*npl;
+ int imin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( a[k]==bcf_int32_vector_end ) break;
+ if ( a[k]==bcf_int32_missing ) continue;
+ if ( imin==-1 || a[imin] > a[k] ) imin = k;
+ }
+ if ( imin<0 ) { idx+=i; continue; }
+
+ for (j=0; j<i; j++)
{
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 0; }
+ int32_t *b = args->tmp_arr + j*npl;
+ int jmin = -1;
+ for (k=0; k<npl; k++)
+ {
+ if ( b[k]==bcf_int32_vector_end ) break;
+ if ( b[k]==bcf_int32_missing ) continue;
+ if ( jmin==-1 || b[jmin] > b[k] ) jmin = k;
+ }
+ if ( jmin<0 ) { idx++; continue; }
+
+ ntot[idx]++;
+ if ( imin!=jmin ) ndif[idx]++;
idx++;
}
- if ( pls[idx] < min_pl ) { min_pl = pls[idx]; min_is_hom = 1; }
- idx++;
}
- return min_is_hom;
+ return 0;
}
static void cross_check_gts(args_t *args)
{
- int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
- unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
- int fake_pls = args->no_PLs, ignore_dp = 0;
-
- int i,j,k,idx, pl_warned = 0, dp_warned = 0;
- int32_t *dp_arr = NULL;
- int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
+ // Initialize things: check which tags are defined in the header, sample names etc.
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
{
if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
- if ( !args->no_PLs )
+ if ( !args->no_PLs ) {
fprintf(pysam_stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
- fake_pls = 1;
+ args->no_PLs = 99;
+ }
}
- if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;
- FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : pysam_stdout;
- print_header(args, fp);
- if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");
+ args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
+ args->narr = (args->nsmpl-1)*args->nsmpl/2;
+
+ uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
+ uint32_t *ntot = (uint32_t*) calloc(args->narr,4);
while ( bcf_sr_next_line(args->files) )
{
- bcf1_t *line = args->files->readers[0].buffer[0];
- bcf_unpack(line, BCF_UN_FMT);
-
- int npl;
- if ( !fake_pls )
- {
- npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
- if ( npl<=0 ) { pl_warned++; continue; }
- npl /= nsamples;
- }
- else
- npl = fake_PLs(args, args->sm_hdr, line);
- int mdp = 0;
- if ( !ignore_dp && (mdp=bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr)) <= 0 ) dp_warned++;
+ bcf1_t *line = bcf_sr_get_line(args->files,0);
- if ( args->hom_only )
+ // use PLs unless no_PLs is set and GT exists
+ if ( args->no_PLs )
{
- for (i=0; i<nsamples; i++)
- is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
+ if ( process_GT(args,line,ntot,ndif)==0 ) continue;
}
-
- double sum = 0; int nsum = 0;
- idx = 0;
- for (i=0; i<nsamples; i++)
- {
- int *ipl = &args->pl_arr[i*npl];
- if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
- if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }
-
- for (j=0; j<i; j++)
- {
- int *jpl = &args->pl_arr[j*npl];
- if ( *jpl==-1 ) { idx++; continue; } // missing genotype
- if ( mdp>0 && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
- if ( args->hom_only && !is_hom[j] ) { idx++; continue; }
-
- int min_pl = INT_MAX;
- for (k=0; k<npl; k++)
- {
- if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
- if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
- if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
- }
- if ( k!=npl ) { idx++; continue; }
-
- if ( args->all_sites ) { sum += min_pl; nsum++; }
- args->lks[idx] += min_pl;
- args->cnts[idx]++;
-
- if ( mdp>0 )
- {
- args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
- dp[i] += dp_arr[i]; ndp[i]++;
- dp[j] += dp_arr[j]; ndp[j]++;
- }
- else
- {
- args->dps[idx]++;
- dp[i]++; ndp[i]++;
- dp[j]++; ndp[j]++;
- }
- idx++;
- }
- }
- if ( args->all_sites )
- fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
+ process_PL(args,line,ntot,ndif);
}
- if ( dp_arr ) free(dp_arr);
- if ( args->pl_arr ) free(args->pl_arr);
- if ( args->tmp_arr ) free(args->tmp_arr);
- if ( is_hom ) free(is_hom);
+
+ FILE *fp = pysam_stdout;
+ print_header(args, fp);
- if ( pl_warned ) fprintf(pysam_stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
- if ( dp_warned ) fprintf(pysam_stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);
+ float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
- // Output samples sorted by average discordance
- double *score = (double*) calloc(nsamples,sizeof(double));
- args->sites = (double*) calloc(nsamples,sizeof(double));
- idx = 0;
- for (i=0; i<nsamples; i++)
+ // Output pairwise distances
+ fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
+ int i,j, idx = 0;
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- score[i] += args->lks[idx];
- score[j] += args->lks[idx];
- args->sites[i] += args->cnts[idx];
- args->sites[j] += args->cnts[idx];
+ float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
+ fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ PDIST(tmp,i,j) = err;
idx++;
}
}
- for (i=0; i<nsamples; i++)
- if ( args->sites[i] ) score[i] /= args->sites[i];
- double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
- for (i=0; i<nsamples; i++) p[i] = &score[i];
- qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
- // The average discordance gives the number of differing sites in % with -G1
- fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
- for (i=0; i<nsamples; i++)
+
+ // Cluster samples
+ int nlist;
+ float clust_max_err = args->max_intra_err;
+ hclust_t *clust = hclust_init(args->nsmpl,tmp);
+ cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
+ fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
+ for (i=0; i<nlist; i++)
{
- idx = p[i] - score;
- double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
- double nsites = args->sites[idx]/(nsamples-1);
- avg_score += score[idx];
- fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
+ fprintf(fp,"CLUSTER\t%f", list[i].dist);
+ for (j=0; j<list[i].nmemb; j++)
+ fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
+ fprintf(fp,"\n");
}
-
- // // Overall score: maximum absolute deviation from the average score
- // fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
- // fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]); // idx still set
- free(p);
- free(score);
- free(dp);
- free(ndp);
-
- // Pairwise discordances
+ hclust_destroy_list(list,nlist);
+ // Debugging output: the cluster graph and data used for deciding
+ char **dbg = hclust_explain(clust,&nlist);
+ for (i=0; i<nlist; i++)
+ fprintf(fp,"DBG\t%s\n", dbg[i]);
+ fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
+ fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
+ fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
+ hclust_destroy(clust);
+ free(tmp);
+
+
+ // Deprecated output for temporary backward compatibility
+ fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
idx = 0;
- for (i=0; i<nsamples; i++)
+ for (i=0; i<args->nsmpl; i++)
{
for (j=0; j<i; j++)
{
- fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
- args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
+ fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
idx++;
}
}
- fclose(fp);
- if ( args->plot )
- plot_cross_check(args);
+
+ free(ndif);
+ free(ntot);
+ free(args->tmp_arr);
}
static char *init_prefix(char *prefix)
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Options:\n");
fprintf(pysam_stderr, " -a, --all-sites output comparison for all sites\n");
+ fprintf(pysam_stderr, " -c, --cluster <min,max> min inter- and max intra-sample error [0.23,-0.3]\n");
fprintf(pysam_stderr, " -g, --genotypes <file> genotypes to compare against\n");
fprintf(pysam_stderr, " -G, --GTs-only <int> use GTs, ignore PLs, using <int> for unseen genotypes [99]\n");
fprintf(pysam_stderr, " -H, --homs-only homozygous genotypes only (useful for low coverage data)\n");
char *regions = NULL, *targets = NULL;
int regions_is_file = 0, targets_is_file = 0;
+ // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23
+ // - min_inter: pairs with smaller err value will be considered identical
+ // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered
+ // different. If negative, the cutoff may be heuristically lowered
+ args->min_inter_err = 0.23;
+ args->max_intra_err = -0.3;
+
static struct option loptions[] =
{
+ {"cluster",1,0,'c'},
{"GTs-only",1,0,'G'},
{"all-sites",0,0,'a'},
{"homs-only",0,0,'H'},
{0,0,0,0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) {
switch (c) {
+ case 'c':
+ args->min_inter_err = strtod(optarg,&tmp);
+ if ( *tmp )
+ {
+ if ( *tmp!=',') error("Could not parse: -c %s\n", optarg);
+ args->max_intra_err = strtod(tmp+1,&tmp);
+ if ( *tmp ) error("Could not parse: -c %s\n", optarg);
+ }
+ break;
case 'G':
args->no_PLs = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg);
-
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
Copyright (C) 2014-2016 Genome Research Ltd.
#include <sys/stat.h>
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
+#include <htslib/kstring.h>
#include "bcftools.h"
#define BCF_LIDX_SHIFT 14
fprintf(stderr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
fprintf(stderr, "\n");
fprintf(stderr, "Indexing options:\n");
- fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
- fprintf(stderr, " -f, --force overwrite index if it already exists\n");
- fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
- fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
+ fprintf(stderr, " -f, --force overwrite index if it already exists\n");
+ fprintf(stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(stderr, " -o, --output-file FILE optional output index file name\n");
+ fprintf(stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(stderr, " --threads sets the number of threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Stats options:\n");
fprintf(stderr, " -n, --nrecords print number of records based on existing index file\n");
- fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n");
+ fprintf(stderr, " -s, --stats print per contig stats based on existing index file\n");
fprintf(stderr, "\n");
exit(1);
}
int vcf_index_stats(char *fname, int stats)
{
- char *fn_out = NULL;
- FILE *out;
- out = fn_out ? fopen(fn_out, "w") : stdout;
-
const char **seq;
int i, nseq;
tbx_t *tbx = NULL;
if ( hts_get_format(fp)->format==vcf )
{
tbx = tbx_index_load(fname);
- if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; }
+ if ( !tbx ) { fprintf(stderr,"Could not load index for VCF: %s\n", fname); return 1; }
}
else if ( hts_get_format(fp)->format==bcf )
{
idx = bcf_index_load(fname);
- if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; }
+ if ( !idx ) { fprintf(stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
}
else
{
if (stats&2 || !records) continue;
bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
+ printf("%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
}
if (!sum)
{
bcf1_t *rec = bcf_init1();
if (bcf_read1(fp, hdr, rec) >= 0)
{
- fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
+ fprintf(stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
return 1;
}
bcf_destroy1(rec);
}
- if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
+ if (stats&2) printf("%" PRIu64 "\n", sum);
free(seq);
- fclose(out);
hts_close(fp);
bcf_hdr_destroy(hdr);
if (tbx)
int main_vcfindex(int argc, char *argv[])
{
- int c, force = 0, tbi = 0, stats = 0;
+ int c, force = 0, tbi = 0, stats = 0, n_threads = 0;
int min_shift = BCF_LIDX_SHIFT;
+ char *outfn = NULL;
static struct option loptions[] =
{
{"min-shift",required_argument,NULL,'m'},
{"stats",no_argument,NULL,'s'},
{"nrecords",no_argument,NULL,'n'},
+ {"threads",required_argument,NULL,9},
+ {"output-file",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'c': tbi = 0; break;
case 't': tbi = 1; min_shift = 0; break;
case 'f': force = 1; break;
- case 'm':
+ case 'm':
min_shift = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
break;
case 's': stats |= 1; break;
case 'n': stats |= 2; break;
+ case 9:
+ n_threads = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
+ break;
+ case 'o': outfn = optarg; break;
default: usage();
}
}
- if ( optind==argc ) usage();
if (stats>2)
{
fprintf(stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
return 1;
}
- char *fname = argv[optind];
- if (stats) return vcf_index_stats(fname, stats);
-
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) error("Failed to read %s\n", fname);
- htsFormat type = *hts_get_format(fp);
- hts_close(fp);
-
- if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf )
+ char *fname = NULL;
+ if ( optind>=argc )
{
- fprintf(stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
- if ( type.compression!=bgzf )
- fprintf(stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
- return 1;
- }
- if (tbi && type.format==bcf)
- {
- fprintf(stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
- tbi = 0; min_shift = BCF_LIDX_SHIFT;
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
}
- if (min_shift == 0 && type.format==bcf)
- {
- fprintf(stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
- return 1;
- }
- if (!tbi && type.format==vcf && min_shift == 0)
+ else fname = argv[optind];
+ if (stats) return vcf_index_stats(fname, stats);
+
+ kstring_t idx_fname = {0,0,0};
+ if (outfn)
+ kputs(outfn,&idx_fname);
+ else
{
- fprintf(stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
- tbi = 1;
+ if (!strcmp(fname, "-")) { fprintf(stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; }
+ ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi");
}
-
if (!force)
{
// Before complaining about existing index, check if the VCF file isn't newer.
- char *idx_fname = (char*)alloca(strlen(fname) + 5);
- strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi");
struct stat stat_tbi, stat_file;
- if ( stat(idx_fname, &stat_tbi)==0 )
+ if ( stat(idx_fname.s, &stat_tbi)==0 )
{
stat(fname, &stat_file);
if ( stat_file.st_mtime <= stat_tbi.st_mtime )
{
- fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
+ fprintf(stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s);
+ free(idx_fname.s);
return 1;
}
}
}
- if (type.format==bcf)
- {
- if ( bcf_index_build(fname, min_shift) != 0 )
- {
- fprintf(stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
- return 1;
- }
- }
- else
- {
- if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
- {
- fprintf(stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
- return 1;
- }
+ int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
+ free(idx_fname.s);
+ if (ret != 0) {
+ if (ret == -2)
+ error("index: failed to open \"%s\"\n", fname);
+ else if (ret == -3)
+ error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname);
+ else
+ error("index: failed to create index for \"%s\"\n", fname);
}
return 0;
}
#include "pysam.h"
-
/* vcfindex.c -- Index bgzip compressed VCF/BCF files for random access.
Copyright (C) 2014-2016 Genome Research Ltd.
#include <sys/stat.h>
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
+#include <htslib/kstring.h>
#include "bcftools.h"
#define BCF_LIDX_SHIFT 14
fprintf(pysam_stderr, "Usage: bcftools index [options] <in.bcf>|<in.vcf.gz>\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Indexing options:\n");
- fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
- fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n");
- fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
- fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(pysam_stderr, " -c, --csi generate CSI-format index for VCF/BCF files [default]\n");
+ fprintf(pysam_stderr, " -f, --force overwrite index if it already exists\n");
+ fprintf(pysam_stderr, " -m, --min-shift INT set minimal interval size for CSI indices to 2^INT [14]\n");
+ fprintf(pysam_stderr, " -o, --output-file FILE optional output index file name\n");
+ fprintf(pysam_stderr, " -t, --tbi generate TBI-format index for VCF files\n");
+ fprintf(pysam_stderr, " --threads sets the number of threads [0]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Stats options:\n");
fprintf(pysam_stderr, " -n, --nrecords print number of records based on existing index file\n");
- fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n");
+ fprintf(pysam_stderr, " -s, --stats print per contig stats based on existing index file\n");
fprintf(pysam_stderr, "\n");
exit(1);
}
int vcf_index_stats(char *fname, int stats)
{
- char *fn_out = NULL;
- FILE *out;
- out = fn_out ? fopen(fn_out, "w") : pysam_stdout;
-
const char **seq;
int i, nseq;
tbx_t *tbx = NULL;
if ( hts_get_format(fp)->format==vcf )
{
tbx = tbx_index_load(fname);
- if ( !tbx ) { fprintf(pysam_stderr,"Could not load TBI index: %s\n", fname); return 1; }
+ if ( !tbx ) { fprintf(pysam_stderr,"Could not load index for VCF: %s\n", fname); return 1; }
}
else if ( hts_get_format(fp)->format==bcf )
{
idx = bcf_index_load(fname);
- if ( !idx ) { fprintf(pysam_stderr,"Could not load CSI index: %s\n", fname); return 1; }
+ if ( !idx ) { fprintf(pysam_stderr,"Could not load index for BCF file: %s\n", fname); return 1; }
}
else
{
if (stats&2 || !records) continue;
bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
- fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
+ fprintf(pysam_stdout, "%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
}
if (!sum)
{
bcf1_t *rec = bcf_init1();
if (bcf_read1(fp, hdr, rec) >= 0)
{
- fprintf(pysam_stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
+ fprintf(pysam_stderr,"index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", fname);
return 1;
}
bcf_destroy1(rec);
}
- if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
+ if (stats&2) fprintf(pysam_stdout, "%" PRIu64 "\n", sum);
free(seq);
- fclose(out);
hts_close(fp);
bcf_hdr_destroy(hdr);
if (tbx)
int main_vcfindex(int argc, char *argv[])
{
- int c, force = 0, tbi = 0, stats = 0;
+ int c, force = 0, tbi = 0, stats = 0, n_threads = 0;
int min_shift = BCF_LIDX_SHIFT;
+ char *outfn = NULL;
static struct option loptions[] =
{
{"min-shift",required_argument,NULL,'m'},
{"stats",no_argument,NULL,'s'},
{"nrecords",no_argument,NULL,'n'},
+ {"threads",required_argument,NULL,9},
+ {"output-file",required_argument,NULL,'o'},
{NULL, 0, NULL, 0}
};
char *tmp;
- while ((c = getopt_long(argc, argv, "ctfm:sn", loptions, NULL)) >= 0)
+ while ((c = getopt_long(argc, argv, "ctfm:sno:", loptions, NULL)) >= 0)
{
switch (c)
{
case 'c': tbi = 0; break;
case 't': tbi = 1; min_shift = 0; break;
case 'f': force = 1; break;
- case 'm':
+ case 'm':
min_shift = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-shift %s\n", optarg);
break;
case 's': stats |= 1; break;
case 'n': stats |= 2; break;
+ case 9:
+ n_threads = strtol(optarg,&tmp,10);
+ if ( *tmp ) error("Could not parse argument: --threads %s\n", optarg);
+ break;
+ case 'o': outfn = optarg; break;
default: usage();
}
}
- if ( optind==argc ) usage();
if (stats>2)
{
fprintf(pysam_stderr, "[E::%s] expected only one of --stats or --nrecords options\n", __func__);
return 1;
}
- char *fname = argv[optind];
- if (stats) return vcf_index_stats(fname, stats);
-
- htsFile *fp = hts_open(fname,"r");
- if ( !fp ) error("Failed to read %s\n", fname);
- htsFormat type = *hts_get_format(fp);
- hts_close(fp);
-
- if ( (type.format!=bcf && type.format!=vcf) || type.compression!=bgzf )
+ char *fname = NULL;
+ if ( optind>=argc )
{
- fprintf(pysam_stderr, "[E::%s] unknown filetype; expected bgzip compressed VCF or BCF\n", __func__);
- if ( type.compression!=bgzf )
- fprintf(pysam_stderr, "[E::%s] was the VCF/BCF compressed with bgzip?\n", __func__);
- return 1;
- }
- if (tbi && type.format==bcf)
- {
- fprintf(pysam_stderr, "[Warning] TBI-index does not work for BCF files. Generating CSI instead.\n");
- tbi = 0; min_shift = BCF_LIDX_SHIFT;
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage();
}
- if (min_shift == 0 && type.format==bcf)
- {
- fprintf(pysam_stderr, "[E::%s] Require min_shift>0 for BCF files.\n", __func__);
- return 1;
- }
- if (!tbi && type.format==vcf && min_shift == 0)
+ else fname = argv[optind];
+ if (stats) return vcf_index_stats(fname, stats);
+
+ kstring_t idx_fname = {0,0,0};
+ if (outfn)
+ kputs(outfn,&idx_fname);
+ else
{
- fprintf(pysam_stderr, "[Warning] min-shift set to 0 for VCF file. Generating TBI file.\n");
- tbi = 1;
+ if (!strcmp(fname, "-")) { fprintf(pysam_stderr, "[E::%s] must specify an output path for index file when reading VCF/BCF from stdin\n", __func__); return 1; }
+ ksprintf(&idx_fname, "%s.%s", fname, tbi ? "tbi" : "csi");
}
-
if (!force)
{
// Before complaining about existing index, check if the VCF file isn't newer.
- char *idx_fname = (char*)alloca(strlen(fname) + 5);
- strcat(strcpy(idx_fname, fname), tbi ? ".tbi" : ".csi");
struct stat stat_tbi, stat_file;
- if ( stat(idx_fname, &stat_tbi)==0 )
+ if ( stat(idx_fname.s, &stat_tbi)==0 )
{
stat(fname, &stat_file);
if ( stat_file.st_mtime <= stat_tbi.st_mtime )
{
- fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite.\n", __func__);
+ fprintf(pysam_stderr,"[E::%s] the index file exists. Please use '-f' to overwrite %s\n", __func__, idx_fname.s);
+ free(idx_fname.s);
return 1;
}
}
}
- if (type.format==bcf)
- {
- if ( bcf_index_build(fname, min_shift) != 0 )
- {
- fprintf(pysam_stderr,"[E::%s] bcf_index_build failed for %s\n", __func__, fname);
- return 1;
- }
- }
- else
- {
- if ( tbx_index_build(fname, min_shift, &tbx_conf_vcf) != 0 )
- {
- fprintf(pysam_stderr,"[E::%s] tbx_index_build failed for %s\n", __func__, fname);
- return 1;
- }
+ int ret = bcf_index_build3(fname, idx_fname.s, min_shift, n_threads);
+ free(idx_fname.s);
+ if (ret != 0) {
+ if (ret == -2)
+ error("index: failed to open \"%s\"\n", fname);
+ else if (ret == -3)
+ error("index: \"%s\" is in a format that cannot be usefully indexed\n", fname);
+ else
+ error("index: failed to create index for \"%s\"\n", fname);
}
return 0;
}
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <htslib/faidx.h>
#include <math.h>
#include <ctype.h>
+#include <time.h>
#include "bcftools.h"
+#include "regidx.h"
#include "vcmp.h"
+#define DBG 0
+
#include <htslib/khash.h>
KHASH_MAP_INIT_STR(strdict, int)
typedef khash_t(strdict) strdict_t;
-#define SKIP_DONE 1
-#define SKIP_DIFF 2
+#define FLT_LOGIC_ADD 0
+#define FLT_LOGIC_REMOVE 1
+
+#define SKIP_DONE 1 // the record was processed
+#define SKIP_DIFF 2 // not compatible, merge later
#define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G)
#define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A)
#define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R)
+#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+
// For merging INFO Number=A,G,R tags
typedef struct
{
void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule);
int type; // one of BCF_HT_*
int block_size; // number of values in a block
+ int type_size; // size of the corresponding BCF_HT_* type
int nblocks; // number of blocks in nvals (the number of merged files)
int nvals, mvals; // used and total size of vals array
void *vals; // the info tag values
}
info_rule_t;
+typedef struct
+{
+ bcf1_t *line;
+ int end, active;
+}
+gvcf_aux_t;
+
// Auxiliary merge data for selecting the right combination
// of buffered records across multiple readers. maux1_t
// corresponds to one buffered line.
typedef struct
{
int skip;
- int *map; // mapping from input alleles to the output array
+ int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles)
int mmap; // size of map array (only buffer[i].n_allele is actually used)
int als_differ;
}
maux1_t;
typedef struct
{
- int n; // number of readers
+ int rid; // current rid
+ int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+ int cur; // current line or -1 if none
+ int npos; // number of unprocessed lines at this position
+ int mrec; // allocated size of buf
+ maux1_t *rec; // buffer to keep reader's lines
+ bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+}
+buffer_t;
+typedef struct
+{
+ int n, pos, var_types; // number of readers, current position, currently available variant types
+ char *chr; // current chromosome
char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
int nals, mals, nout_als, mout_als; // size of the output array
int *cnt, ncnt; // number of records that refer to the alleles
- int *nbuf; // readers have buffers of varying lengths
int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
- int *flt, mflt, minf;
- bcf_info_t *inf;// out_line's INFO fields
bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
int nfmt_map; // number of rows in the fmt_map array
int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
void *tmp_arr;
int ntmp_arr;
- maux1_t **d; // d[i][j] i-th reader, j-th buffer line
+ buffer_t *buf;
AGR_info_t *AGR_info;
int nAGR_info, mAGR_info;
bcf_srs_t *files;
- int *has_line; // which files are being merged
+ int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present
+ gvcf_aux_t *gvcf; // buffer of gVCF lines
}
maux_t;
{
vcmp_t *vcmp;
maux_t *maux;
- int header_only, collapse, output_type, force_samples, merge_by_id;
+ regidx_t *regs; // apply regions only after the blocks are expanded
+ regitr_t *regs_itr;
+ int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
+ faidx_t *gvcf_fai;
info_rule_t *rules;
int nrules;
strdict_t *tmph;
}
args_t;
+static bcf1_t *maux_get_line(args_t *args, int i)
+{
+ maux_t *ma = args->maux;
+ int ibuf = ma->buf[i].cur;
+ if ( ibuf >= 0 ) return ma->buf[i].lines[ibuf];
+ return NULL;
+}
+
static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
{
if ( !rule->nvals ) return;
if ( str.l ) kputc(',',&str);
kputs("DP4:sum",&str);
}
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "QS")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("QS:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("MinDP:min",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "I16")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("I16:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IDV")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IDV:max",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IMF")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IMF:max",&str);
+ }
+
if ( !str.l ) return;
args->info_rules = str.s;
}
int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
- if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+ if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
+ else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
+ else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
+ else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
- while ( *ss ) ss++; ss++;
+ ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
int is_join = 0;
error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
}
- while ( *ss ) ss++; ss++; n++;
+ ss = strchr(ss, '\0'); ss++;
+ n++;
}
free(str.s);
free(tmp);
}
static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len)
{
- int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type);
+ int msize = args->maux->ntmp_arr / rule->type_size;
+ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type);
if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret);
+ args->maux->ntmp_arr = msize * rule->type_size;
rule->nblocks++;
int i, j;
if ( var_len==BCF_VL_A )
{
- assert( ret==line->n_allele-1 );
+ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
// create mapping from source file ALT indexes to dst file indexes
}
else if ( var_len==BCF_VL_R )
{
- assert( ret==line->n_allele );
+ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
{
for (i=0; i<*nb; i++)
{
+ if ( b[i][0]=='<' ) continue; // symbolic allele, do not modify
+ if ( b[i][0]=='*' ) continue; // overlapping deletion (*), do not modify
int l = strlen(b[i]);
b[i] = (char*) realloc(b[i],l+rla-rlb+1);
memcpy(b[i]+l,a[0]+rlb,rla-rlb+1);
// now check if the $a alleles are present and if not add them
for (i=1; i<na; i++)
{
+ int const_ai = 1;
char *ai;
- if ( rlb>rla ) // $a alleles need expanding
+ if ( rlb>rla && a[i][0]!='<' && a[i][0]!='*' ) // $a alleles need expanding and not a symbolic allele or *
{
int l = strlen(a[i]);
ai = (char*) malloc(l+rlb-rla+1);
memcpy(ai,a[i],l);
memcpy(ai+l,b[0]+rla,rlb-rla+1);
+ const_ai = 0;
}
else
ai = a[i];
if ( j<*nb ) // $b already has the same allele
{
map[i] = j;
- if ( rlb>rla ) free(ai);
+ if ( !const_ai ) free(ai);
continue;
}
// new allele
map[i] = *nb;
- b[*nb] = rlb>rla ? ai : strdup(ai);
+ if ( b[*nb] ) free(b[*nb]);
+ b[*nb] = const_ai ? strdup(ai) : ai;
(*nb)++;
}
return b;
}
-maux_t *maux_init(bcf_srs_t *files)
+maux_t *maux_init(args_t *args)
{
+ bcf_srs_t *files = args->files;
maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t));
ma->n = files->nreaders;
- ma->nbuf = (int *) calloc(ma->n,sizeof(int));
- ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*));
ma->files = files;
int i, n_smpl = 0;
for (i=0; i<ma->n; i++)
n_smpl += bcf_hdr_nsamples(files->readers[i].header);
+ if ( args->do_gvcf )
+ {
+ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+ for (i=0; i<ma->n; i++)
+ ma->gvcf[i].line = bcf_init1();
+ }
ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int));
ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
- ma->has_line = (int*) malloc(ma->n*sizeof(int));
+ ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
+ for (i=0; i<ma->n; i++)
+ ma->buf[i].rid = -1;
return ma;
}
void maux_destroy(maux_t *ma)
{
- int i;
+ int i,j;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
for (i=0; i<ma->n; i++) // for each reader
{
- if ( !ma->d[i] ) continue;
- int j;
- for (j=0; j<ma->nbuf[i]; j++) // for each buffered line
- if ( ma->d[i][j].map ) free(ma->d[i][j].map);
- free(ma->d[i]);
+ for (j=0; j<ma->buf[i].mrec; j++) // for each buffered line
+ free(ma->buf[i].rec[j].map);
+ free(ma->buf[i].rec);
+ }
+ free(ma->buf);
+ if ( ma->gvcf )
+ {
+ for (i=0; i<ma->n; i++) bcf_destroy(ma->gvcf[i].line);
+ free(ma->gvcf);
}
for (i=0; i<ma->mAGR_info; i++)
free(ma->AGR_info[i].buf);
if (ma->ntmp_arr) free(ma->tmp_arr);
if (ma->nfmt_map) free(ma->fmt_map);
// ma->inf freed in bcf_destroy1
- free(ma->d);
- free(ma->nbuf);
for (i=0; i<ma->mals; i++) free(ma->als[i]);
if (ma->mout_als) free(ma->out_als);
free(ma->als);
free(ma->cnt);
free(ma->smpl_ploidy);
free(ma->smpl_nGsize);
- free(ma->has_line);
+ free(ma->chr);
free(ma);
}
-void maux_expand1(maux_t *ma, int i)
+void maux_expand1(buffer_t *buf, int size)
{
- if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer )
+ if ( buf->mrec < size )
{
- int n = ma->files->readers[i].nbuffer + 1;
- ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n);
- memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i]));
- ma->nbuf[i] = n;
+ hts_expand0(maux1_t,size,buf->mrec,buf->rec);
+ buf->mrec = size;
}
}
void maux_reset(maux_t *ma)
{
- int i;
- for (i=0; i<ma->n; i++) maux_expand1(ma, i);
- for (i=1; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ int i,j;
+ for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
+ for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
+ const char *chr = NULL;
+ ma->nals = 0;
+ ma->pos = -1;
+ for (i=0; i<ma->n; i++)
+ {
+ if ( !bcf_sr_has_line(ma->files,i) ) continue;
+ bcf1_t *line = bcf_sr_get_line(ma->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ chr = bcf_seqname(hdr,line);
+ ma->pos = line->pos;
+ break;
+ }
+ if ( chr )
+ {
+ free(ma->chr);
+ ma->chr = strdup(chr);
+ }
+ for (i=0; i<ma->n; i++)
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ ma->buf[i].rid = bcf_hdr_name2id(hdr,chr);
+ ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1;
+ for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
+ {
+ ma->buf[i].rec[j].skip = 0;
+ bcf1_t *line = ma->files->readers[i].buffer[j];
+ if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
+ }
+ ma->buf[i].end = j;
+ ma->buf[i].cur = -1;
+ if ( ma->buf[i].beg < ma->buf[i].end )
+ {
+ ma->buf[i].lines = ma->files->readers[i].buffer;
+ if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record
+ }
+ }
}
void maux_debug(maux_t *ma, int ir, int ib)
{
out->pos = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_unpack(line, BCF_UN_ALL);
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- // alleles
+ // not all maux alleles are always used, mark the ones we'll need
int j;
for (j=1; j<line->n_allele; j++)
- al_idxs[ ma->d[i][0].map[j] ] = 1;
+ {
+ int irec = ma->buf[i].cur;
+ al_idxs[ ma->buf[i].rec[irec].map[j] ] = 1;
+ }
// position
if ( out->pos==-1 )
}
// set QUAL to the max qual value. Not exactly correct, but good enough for now
- if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) )
+ if ( !bcf_float_is_missing(line->qual) )
{
- if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual;
+ if ( bcf_float_is_missing(out->qual) || out->qual < line->qual ) out->qual = line->qual;
}
}
// set ID
if ( !tmps->l ) kputs(".", tmps);
- if ( out->d.id ) free(out->d.id);
- out->d.id = strdup(tmps->s);
+ bcf_update_id(out_hdr, out, tmps->s);
// set alleles
ma->nout_als = 0;
int ir, j;
for (ir=0; ir<files->nreaders; ir++)
{
- if ( !ma->has_line[ir] ) continue;
- bcf1_t *line = files->readers[ir].buffer[0];
+ bcf1_t *line = maux_get_line(args,ir);
+ if ( !line ) continue;
for (j=1; j<line->n_allele; j++)
- if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als;
+ {
+ int irec = ma->buf[ir].cur;
+ if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+ }
}
}
// Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block.
bcf_hdr_t *out_hdr = args->out_hdr;
int i, ret;
+ if ( args->filter_logic == FLT_LOGIC_REMOVE )
+ {
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ if ( bcf_has_filter(hdr, line, "PASS") ) break;
+ }
+ if ( i<files->nreaders )
+ {
+ int flt_id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
+ bcf_add_filter(out_hdr, out, flt_id);
+ return;
+ }
+ }
+
khiter_t kitr;
strdict_t *tmph = args->tmph;
kh_clear(strdict, tmph);
- maux_t *ma = args->maux;
out->d.n_flt = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i]) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- bcf_unpack(line, BCF_UN_ALL);
int k;
for (k=0; k<line->d.n_flt; k++)
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt);
if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt);
- hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt);
- ma->flt[out->d.n_flt] = id;
+ hts_expand(int,out->d.n_flt+1,out->d.m_flt,out->d.flt);
+ out->d.flt[out->d.n_flt] = id;
out->d.n_flt++;
kh_put(strdict, tmph, flt, &ret);
}
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
for (i=0; i<out->d.n_flt; i++)
- if ( ma->flt[i]==id ) break;
+ if ( out->d.flt[i]==id ) break;
if ( i<out->d.n_flt )
{
out->d.n_flt--;
- for (; i<out->d.n_flt; i++) ma->flt[i] = ma->flt[i+1];
+ for (; i<out->d.n_flt; i++) out->d.flt[i] = out->d.flt[i+1];
}
}
- out->d.flt = ma->flt;
}
static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str)
{
- assert( !info->vptr_free );
-
uint8_t *ptr = info->vptr - info->vptr_off;
bcf_dec_typed_int1(ptr, &ptr);
kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str);
info->vptr = (uint8_t*) tmp_str->s + info->vptr_off;
- info->vptr_free = 1;
- line->d.shared_dirty |= BCF1_DIRTY_INF;
tmp_str->s = NULL;
tmp_str->m = 0;
tmp_str->l = 0;
info_rules_reset(args);
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
+ int irec = ma->buf[i].cur;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_info; j++)
{
info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key);
if ( rule )
{
- maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL;
+ maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->buf[i].rec[irec] : NULL;
if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue;
}
}
{
if ( kitr == kh_end(tmph) )
{
- // first occurance in this reader, alloc arrays
+ // seeing this key for the first time
ma->nAGR_info++;
hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info);
kitr = kh_put(strdict, tmph, key, &ret);
kitr = kh_get(strdict, tmph, key);
int idx = kh_val(tmph, kitr);
if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1);
- merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]);
+ merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]);
continue;
}
if ( kitr == kh_end(tmph) )
{
- hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf);
- ma->inf[out->n_info].key = id;
- ma->inf[out->n_info].type = inf->type;
- ma->inf[out->n_info].len = inf->len;
- ma->inf[out->n_info].vptr = inf->vptr;
- ma->inf[out->n_info].v1.i = inf->v1.i;
- ma->inf[out->n_info].v1.f = inf->v1.f;
- ma->inf[out->n_info].vptr_off = inf->vptr_off;
- ma->inf[out->n_info].vptr_len = inf->vptr_len;
- ma->inf[out->n_info].vptr_free = inf->vptr_free;
+ // Seeing this key for the first time. Although quite hacky,
+ // this is faster than anything else given the data structures..
+
+ hts_expand0(bcf_info_t,out->n_info+1,out->d.m_info,out->d.info);
+ out->d.info[out->n_info].key = id;
+ out->d.info[out->n_info].type = inf->type;
+ out->d.info[out->n_info].len = inf->len;
+ out->d.info[out->n_info].v1.i = inf->v1.i;
+ out->d.info[out->n_info].v1.f = inf->v1.f;
+ out->d.info[out->n_info].vptr_off = inf->vptr_off;
+ out->d.info[out->n_info].vptr_len = inf->vptr_len;
+ out->d.info[out->n_info].vptr_free = 1;
+ out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off);
+ memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off);
+ out->d.info[out->n_info].vptr += inf->vptr_off;
if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) )
- {
- // The existing packed info cannot be reused. Change the id.
- // Although quite hacky, it's faster than anything else given
- // the data structures
- bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps);
- }
+ bcf_info_set_id(out, &out->d.info[out->n_info], id, &args->tmps);
+ out->d.shared_dirty |= BCF1_DIRTY_INF;
out->n_info++;
kitr = kh_put(strdict, tmph, key, &ret);
kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value
}
}
}
- out->d.info = ma->inf;
- out->d.m_info = ma->minf;
for (i=0; i<args->nrules; i++)
args->rules[i].merger(args->out_hdr, out, &args->rules[i]);
for (i=0; i<ma->nAGR_info; i++)
}
memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
+ int default_gt = args->missing_to_ref ? bcf_gt_unphased(0) : bcf_gt_missing;
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize;
+ int irec = ma->buf[i].cur;
int j, k;
if ( !fmt_ori )
// missing values: assume maximum ploidy
for (j=0; j<bcf_hdr_nsamples(hdr); j++)
{
- for (k=0; k<nsize; k++) { tmp[k] = 0; ma->smpl_ploidy[ismpl+j]++; }
+ for (k=0; k<nsize; k++) { tmp[k] = default_gt; ma->smpl_ploidy[ismpl+j]++; }
tmp += nsize;
}
ismpl += bcf_hdr_nsamples(hdr);
#define BRANCH(type_t, vector_end) { \
type_t *p_ori = (type_t*) fmt_ori->p; \
- if ( !ma->d[i][0].als_differ ) \
+ if ( !ma->buf[i].rec[irec].als_differ ) \
{ \
/* the allele numbering is unchanged */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
else \
{ \
int al = (p_ori[k]>>1) - 1; \
- al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \
+ al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \
tmp[k] = (al << 1) | ((p_ori[k])&1); \
} \
} \
int nsize = 0, length = BCF_VL_FIXED, type = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ if ( !maux_get_line(args,i) ) continue;
if ( !fmt_map[i] ) continue;
if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key;
type = fmt_map[i]->type;
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = ma->buf[i].cur;
if ( fmt_ori )
{
type = fmt_ori->type;
- int nals_ori = reader->buffer[0]->n_allele;
+ int nals_ori = line->n_allele;
if ( length==BCF_VL_G )
{
// if all fields are missing then n==1 is valid
ismpl += bcf_hdr_nsamples(hdr); \
continue; \
} \
- assert( ma->has_line[i] ); \
- bcf1_t *line = reader->buffer[0]; \
src_type_t *src = (src_type_t*) fmt_ori->p; \
- if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \
+ if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \
{ \
/* alleles unchanged, copy over */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
int iori, inew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
int iori,jori, inew,jnew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
for (jori=0; jori<=iori; jori++) \
{ \
- jnew = ma->d[i][0].map[jori]; \
+ jnew = ma->buf[i].rec[irec].map[jori]; \
int kori = iori*(iori+1)/2 + jori; \
int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
int iori,inew; \
for (iori=ifrom; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori] - ifrom; \
+ inew = ma->buf[i].rec[irec].map[iori] - ifrom; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
if ( src_is_missing ) tgt_set_missing; \
int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_fmt; j++)
{
ma->fmt_map[ifmt*files->nreaders+i] = fmt;
}
// Check if the allele numbering must be changed
- for (j=1; j<reader->buffer[0]->n_allele; j++)
- if ( ma->d[i][0].map[j]!=j ) break;
- ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1;
+ int irec = ma->buf[i].cur;
+ for (j=1; j<line->n_allele; j++)
+ if ( ma->buf[i].rec[irec].map[j]!=j ) break;
+ ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
}
out->n_sample = bcf_hdr_nsamples(out_hdr);
merge_GT(args, ma->fmt_map, out);
update_AN_AC(out_hdr, out);
- if ( out->d.info!=ma->inf )
- {
- // hacky, we rely on htslib internals: bcf_update_info() reallocated the info
- ma->inf = out->d.info;
- ma->minf = out->d.m_info;
- }
-
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
out->d.indiv_dirty = 1;
}
-// The core merging function, one or none line from each reader
-void merge_line(args_t *args)
+void gvcf_set_alleles(args_t *args)
+{
+ int i,k;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ maux->nals = 0;
+
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = maux->buf[i].cur;
+
+ hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map);
+ if ( !maux->nals ) // first record, copy the alleles to the output
+ {
+ maux->nals = line->n_allele;
+ hts_expand0(char*, maux->nals, maux->mals, maux->als);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=0; k<maux->nals; k++)
+ {
+ if ( maux->als[k] ) free(maux->als[k]);
+ maux->als[k] = strdup(line->d.allele[k]);
+ maux->buf[i].rec[irec].map[k] = k;
+ }
+ }
+ else
+ {
+ maux->als = merge_alleles(line->d.allele, line->n_allele, maux->buf[i].rec[irec].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als )
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
+ error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1);
+ }
+ }
+ }
+}
+
+/*
+ Output staged gVCF blocks, end is the last position of the block. Assuming
+ gaux[i].active flags are set and maux_get_line returns correct lines.
+*/
+void gvcf_write_block(args_t *args, int start, int end)
{
+ int i;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ assert(gaux);
+
+ // Update POS
+ int min = INT_MAX;
+ char ref = 'N';
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( ref=='N' && gaux[i].line->pos==start ) ref = gaux[i].line->d.allele[0][0];
+ gaux[i].line->pos = start;
+ }
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < start )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ gaux[i].line->d.allele[0][0] = ref;
+ if ( min > gaux[i].end ) min = gaux[i].end;
+ }
+ // Check for valid gVCF blocks in this region
+ if ( min==INT_MAX )
+ {
+ assert(0);
+ maux->gvcf_min = 0;
+ return;
+ }
+
bcf1_t *out = args->out_line;
- bcf_clear1(out);
- out->unpacked = BCF_UN_ALL;
+ gvcf_set_alleles(args);
+
+ // Merge the staged lines
merge_chrom2qual(args, out);
merge_filter(args, out);
merge_info(args, out);
merge_format(args, out);
- bcf_write1(args->out_fh, args->out_hdr, out);
-}
+ if ( args->gvcf_fai && out->d.allele[0][0]=='N' )
+ {
+ int slen = 0;
+ char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+ if (slen)
+ {
+ out->d.allele[0][0] = seq[0];
+ free(seq);
+ }
+ }
+ // Update END boundary
+ if ( end > start )
+ {
+ end++;
+ bcf_update_info_int32(args->out_hdr, out, "END", &end, 1);
+ }
+ else
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
-void debug_buffers(FILE *fp, bcf_srs_t *files);
-void debug_buffer(FILE *fp, bcf_sr_t *reader);
-#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+ // Inactivate blocks which do not extend beyond END and find new gvcf_min
+ min = INT_MAX;
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < end )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ // next min END position bigger than the current one
+ if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1;
+ }
+ maux->gvcf_min = min==INT_MAX ? 0 : min;
+}
-// Clean the reader's buffer to and make it ready for the next next_line() call.
-// Moves finished records (SKIP_DONE flag set) at the end of the buffer and put
-// the rest to the beggining. Then shorten the buffer so that the last element
-// points to the last unfinished record. There are two special cases: the last
-// line of the buffer typically has a different position and must stay at the
-// end; next, the first record of the buffer must be one of those already
-// printed, as it will be discarded by next_line().
-//
-void shake_buffer(maux_t *maux, int ir, int pos)
+/*
+ Flush staged gVCF blocks. Flush everything if there are no more lines
+ (done=1) or if there is a new chromosome. If still on the same chromosome,
+ all hanging blocks must be ended by creating new records:
+ A
+ 1 END=10
+ B
+ 3 END=7
+ C
+ 3 END=5
+ out
+ 1 END=2 A . .
+ 3 END=5 A B C
+ 6 END=7 A B .
+ 8 END=10 A . .
+
+*/
+void gvcf_flush(args_t *args, int done)
{
- bcf_sr_t *reader = &maux->files->readers[ir];
- maux1_t *m = maux->d[ir];
-
- if ( !reader->buffer ) return;
-
int i;
- // FILE *fp = stdout;
- // fprintf(fp,"<going to shake> nbuf=%d\t", reader->nbuffer); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n");
- // debug_buffer(fp,reader);
- // fprintf(fp,"--\n");
+ maux_t *maux = args->maux;
- int a = 1, b = reader->nbuffer;
- if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards
+ if ( !maux->chr ) return; // first time here, nothing to flush
- while ( a<b )
+ int flush_until = INT_MAX;
+ if ( !done )
{
- if ( !(m[a].skip&SKIP_DONE) ) { a++; continue; }
- if ( m[b].skip&SKIP_DONE ) { b--; continue; }
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[b]);
- SWAP(maux1_t, m[a], m[b]);
- a++;
- b--;
- }
+ // Get current position and chromosome
+ for (i=0; i<maux->n; i++)
+ if ( bcf_sr_has_line(maux->files,i) ) break;
+ bcf1_t *line = bcf_sr_get_line(maux->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(maux->files,i);
- // position $a to the after the first unfinished record
- while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++;
+ if ( !strcmp(maux->chr,bcf_seqname(hdr,line)) ) flush_until = line->pos; // still on the same chr
+ }
- if ( a<reader->nbuffer )
+ // When called on a region, trim the blocks accordingly
+ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos;
+ if ( args->regs )
{
- // there is a gap between the unfinished lines at the beggining and the
- // last line. The last line must be brought forward to fill the gap
- if ( reader->buffer[reader->nbuffer]->pos != pos )
+ int rstart = -1, rend = -1;
+ if ( regidx_overlap(args->regs,maux->chr,start,flush_until,args->regs_itr) )
{
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]);
- SWAP(maux1_t, m[a], m[reader->nbuffer]);
- reader->nbuffer = a;
+ // In case there are multiple regions, we treat them as one
+ rstart = args->regs_itr->beg;
+ while ( regitr_overlap(args->regs_itr) ) rend = args->regs_itr->end;
}
+ if ( rstart > start ) start = rstart;
+ if ( rend < flush_until ) flush_until = rend+1;
}
- if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos )
+ // output all finished blocks
+ while ( maux->gvcf_min && start < flush_until )
{
- // the first record is unfinished, replace it with an empty line
- // from the end of the buffer or else next_line will remove it
- if ( reader->nbuffer + 1 >= maux->nbuf[ir] )
+ // does the block end before the new line or is it interrupted?
+ int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until;
+ if ( start > tmp-1 ) break;
+ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based
+ start = tmp;
+ }
+}
+
+/*
+ Check incoming lines for new gVCF blocks, set pointer to the current source
+ buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
+ called only after maux_reset as it relies on updated maux buffers.
+*/
+void gvcf_stage(args_t *args, int pos)
+{
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ bcf_srs_t *files = args->files;
+ int32_t *end = (int32_t*) maux->tmp_arr;
+ int i, nend = maux->ntmp_arr / sizeof(int32_t);
+
+ maux->gvcf_break = -1;
+ maux->gvcf_min = INT_MAX;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( gaux[i].active )
{
- reader->nbuffer++;
- maux_expand1(maux, ir);
- reader->nbuffer--;
- m = maux->d[ir];
+ // gvcf block should not overlap with another record
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+ continue;
}
- if ( reader->nbuffer+1 >= reader->mbuffer )
- error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer);
- if ( reader->buffer[reader->nbuffer]->pos!=pos )
+ // Does any of the lines have END set? It is enough to check only the
+ // first line, there should be no duplicate records with END in gVCF
+
+ if ( maux->buf[i].beg==maux->buf[i].end ) continue; // no new record
+
+ int irec = maux->buf[i].beg;
+ bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
+ bcf1_t *line = args->files->readers[i].buffer[irec];
+ int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+ if ( ret==1 )
{
- // 4way swap
- bcf1_t *tmp = reader->buffer[0];
- reader->buffer[0] = reader->buffer[reader->nbuffer+1];
- reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer];
- reader->buffer[reader->nbuffer] = tmp;
- m[reader->nbuffer].skip = m[0].skip;
- m[reader->nbuffer+1].skip = SKIP_DIFF;
- reader->nbuffer++;
+ // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with
+ // an empty record: the gaux line must be kept until we reach its END.
+ gaux[i].active = 1;
+ gaux[i].end = end[0] - 1;
+ SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line);
+ gaux[i].line->pos = pos;
+
+ maux->buf[i].lines = &gaux[i].line;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+
+ // Set the rid,pos of the swapped line in the buffer or else the
+ // synced reader will have a problem with the next line
+ //
+ args->files->readers[i].buffer[irec]->rid = maux->buf[i].rid;
+ args->files->readers[i].buffer[irec]->pos = maux->pos;
+
+ // Update block offsets
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
}
else
- {
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]);
- SWAP(maux1_t, m[0], m[reader->nbuffer+1]);
- }
+ maux->gvcf_break = line->pos; // must break the gvcf block
}
+ maux->ntmp_arr = nend * sizeof(int32_t);
+ maux->tmp_arr = end;
+ if ( maux->gvcf_min==INT_MAX ) maux->gvcf_min = 0;
+}
+
+
+void debug_buffers(FILE *fp, bcf_srs_t *files);
+void debug_buffer(FILE *fp, bcf_srs_t *files, int reader);
+
+/*
+ Flush all buffered and processed records with the same coordinate.
+ Note that synced reader discards buffer[0], so that needs to stay
+ untouched.
+*/
+void clean_buffer(args_t *args)
+{
+ maux_t *ma = args->maux;
+
+ int ir;
+ for (ir=0; ir<ma->n; ir++)
+ {
+ // Invalidate pointer to reader's buffer or else gvcf_flush will attempt
+ // to use the old lines via maux_get_line()
+ if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
- // debug_buffer(fp,reader);
- // fprintf(fp,"<shaken>\t"); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]);
- // fprintf(fp,"\n\n");
+ bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir);
+ if ( !reader->nbuffer ) continue; // nothing to clean
- // set position of finished buffer[0] line to -1, otherwise swapping may
- // bring it back after next_line()
- reader->buffer[0]->pos = -1;
+ bcf1_t **buf = reader->buffer;
+ if ( buf[1]->rid!=ma->buf[ir].rid || buf[1]->pos!=ma->pos ) continue; // nothing to flush
- // trim the buffer, remove finished lines from the end
- i = reader->nbuffer;
- while ( i>=1 && m[i--].skip&SKIP_DONE )
- reader->nbuffer--;
+ int a = 1, b = 2;
+ while ( b<=reader->nbuffer && buf[b]->rid==ma->buf[ir].rid && buf[b]->pos==ma->pos ) b++;
+ // b now points to the first line we want to preserve
+ while ( b<=reader->nbuffer )
+ {
+ SWAP(bcf1_t*, buf[a], buf[b]);
+ a++; b++;
+ }
+ reader->nbuffer -= b-a;
+ }
}
-void debug_maux(args_t *args, int pos, int var_type)
+void debug_maux(args_t *args)
{
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
int j,k,l;
- fprintf(stderr,"Alleles to merge at %d\n", pos+1);
+ fprintf(stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals);
for (j=0; j<files->nreaders; j++)
{
bcf_sr_t *reader = &files->readers[j];
+ buffer_t *buf = &maux->buf[j];
fprintf(stderr," reader %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
+ for (k=buf->beg; k<buf->end; k++)
{
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ if ( buf->rec[k].skip & SKIP_DONE ) continue;
bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
fprintf(stderr,"\t");
- if ( maux->d[j][k].skip ) fprintf(stderr,"["); // this record will not be merged in this round
+ if ( buf->rec[k].skip ) fprintf(stderr,"["); // this record will not be merged in this round
for (l=0; l<line->n_allele; l++)
fprintf(stderr,"%s%s", l==0?"":",", line->d.allele[l]);
- if ( maux->d[j][k].skip ) fprintf(stderr,"]");
+ if ( buf->rec[k].skip ) fprintf(stderr,"]");
}
fprintf(stderr,"\n");
}
fprintf(stderr," counts: ");
- for (j=0; j<maux->nals; j++) fprintf(stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(stderr,"\n");
- for (j=0; j<files->nreaders; j++)
- {
- bcf_sr_t *reader = &files->readers[j];
- fprintf(stderr," out %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
- {
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
- bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
- if ( maux->d[j][k].skip ) continue;
- fprintf(stderr,"\t");
- for (l=0; l<line->n_allele; l++)
- fprintf(stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
- }
- fprintf(stderr,"\n");
- }
- fprintf(stderr,"\n");
+ for (j=0; j<maux->nals; j++) fprintf(stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]);
+ fprintf(stderr,"\n\n");
}
-// Determine which line should be merged from which reader: go through all
-// readers and all buffered lines, expand REF,ALT and try to match lines with
-// the same ALTs. A step towards output independent on input ordering of the
-// lines.
-void merge_buffer(args_t *args)
+
+/*
+ Determine which line should be merged from which reader: go through all
+ readers and all buffered lines, expand REF,ALT and try to match lines with
+ the same ALTs.
+ */
+int can_merge(args_t *args)
{
bcf_srs_t *files = args->files;
- int i, pos = -1, var_type = 0;
- char *id = NULL;
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
maux_t *maux = args->maux;
- maux_reset(maux);
+ gvcf_aux_t *gaux = maux->gvcf;
+ char *id = NULL, ref = 'N';
+ maux->var_types = maux->nals = 0;
- // set the current position
+ int i,j,k, ntodo = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( bcf_sr_has_line(files,i) )
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = bcf_sr_get_line(files,i);
- pos = line->pos;
- var_type = bcf_get_variant_types(line);
- id = line->d.id;
- break;
+ // skip readers with active gvcf blocks
+ buf->rec[buf->beg].skip = SKIP_DIFF;
+ continue;
+ }
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ buf->rec[j].skip = SKIP_DIFF;
+ ntodo++;
+
+ if ( args->merge_by_id )
+ id = buf->lines[j]->d.id;
+ else
+ {
+ int var_type = bcf_get_variant_types(buf->lines[j]);
+ maux->var_types |= var_type ? var_type<<1 : 1;
+ }
}
+
+ // for gvcf: find out REF at this position
+ if ( buf->beg < buf->end && ref=='N' )
+ ref = buf->lines[buf->beg]->d.allele[0][0];
}
+ if ( !ntodo ) return 0;
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
- int j, k;
- for (j=0; j<=reader->nbuffer; j++)
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = reader->buffer[j];
+ gaux[i].line->d.allele[0][0] = ref;
+ gaux[i].line->pos = maux->pos;
+ }
+
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
+
int line_type = bcf_get_variant_types(line);
+ line_type = line_type ? line_type<<1 : 1;
+
// select relevant lines
- maux->d[i][j].skip = SKIP_DIFF;
- if ( pos!=line->pos )
- {
- if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore
- continue;
- }
if ( args->merge_by_id )
{
if ( strcmp(id,line->d.id) ) continue;
{
// All alleles of the tested record must be present in the
// selected maux record plus variant types must be the same
- if ( var_type!=line->d.var_type ) continue;
+ if ( (maux->var_types & line_type) != line_type ) continue;
if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
for (k=1; k<line->n_allele; k++)
{
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
}
- if ( k==line->n_allele ) continue; // no matching allele
+ if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele
}
if ( !(args->collapse&COLLAPSE_ANY) )
{
- int compatible = 0;
- if ( line_type==var_type ) compatible = 1;
- else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything
- else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1;
- else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1;
- if ( !compatible ) continue;
+ // Merge:
+ // - SNPs+SNPs+MNPs+REF if -m both,snps
+ // - indels+indels+REF if -m both,indels, REF only if SNPs are not present
+ // - SNPs come first
+ if ( line_type & indel_mask )
+ {
+ if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
+ if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
+ }
}
}
- maux->d[i][j].skip = 0;
+ buf->rec[j].skip = 0;
- hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map);
+ hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
if ( !maux->nals ) // first record, copy the alleles to the output
{
maux->nals = line->n_allele;
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=0; k<maux->nals; k++)
{
+ free(maux->als[k]);
maux->als[k] = strdup(line->d.allele[k]);
- maux->d[i][j].map[k] = k;
+ buf->rec[j].map[k] = k;
maux->cnt[k] = 1;
}
- pos = line->pos;
continue;
}
-
// normalize alleles
- maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals);
- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname);
+ maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(args->out_hdr,line),line->pos+1,reader->fname);
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=1; k<line->n_allele; k++)
- maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files
+ maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files
maux->cnt[0]++;
}
}
+ return 1;
+}
- // debug_maux(args, pos, var_type);
+/*
+ Select records that have the same alleles; the input ordering of indels
+ must not matter. Multiple VCF lines can be emitted from this loop.
+ We expect only very few alleles and not many records with the same
+ position in the buffers, therefore the nested loops should not slow us
+ much.
+*/
+void stage_line(args_t *args)
+{
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
- // Select records that have the same alleles; the input ordering of indels
- // must not matter. Multiple VCF lines can be emitted from this loop.
- // We expect only very few alleles and not many records with the same
- // position in the buffers, therefore the nested loops should not slow us
- // much.
- while (1)
+ // debug_maux(args);
+
+ // take the most frequent allele present in multiple files, REF is skipped
+ int i,j,k,icnt = 1;
+ for (i=2; i<maux->nals; i++)
+ if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+
+ int nout = 0;
+ for (i=0; i<files->nreaders; i++)
{
- // take the most frequent allele present in multiple files
- int icnt = 0;
- for (i=1; i<maux->nals; i++)
- if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
- if ( maux->cnt[icnt]<0 ) break;
+ buffer_t *buf = &maux->buf[i];
+ buf->cur = -1;
+ if ( buf->beg >= buf->end ) continue; // no lines in the buffer
- int nmask = 0;
- for (i=0; i<files->nreaders; i++)
+ // find lines with the same allele
+ for (j=buf->beg; j<buf->end; j++)
{
- maux->has_line[i] = 0;
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->merge_by_id ) break;
+ if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record
- bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
+ for (k=0; k<buf->lines[j]->n_allele; k++)
+ if ( icnt==buf->rec[j].map[k] ) break;
- // find lines with the same allele
- int j;
- for (j=0; j<=reader->nbuffer; j++)
- {
- if ( maux->d[i][j].skip ) continue;
- int k;
- for (k=0; k<reader->buffer[j]->n_allele; k++)
- if ( icnt==maux->d[i][j].map[k] ) break;
- if ( k<reader->buffer[j]->n_allele ) break;
- }
- if ( j>reader->nbuffer )
- {
- // no matching allele found in this file
- if ( args->collapse==COLLAPSE_NONE ) continue;
+ if ( k<buf->lines[j]->n_allele ) break;
+ }
+ if ( j>=buf->end )
+ {
+ // no matching allele found in this file
+ if ( args->collapse==COLLAPSE_NONE ) continue;
- for (j=0; j<=reader->nbuffer; j++)
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
+ int line_type = bcf_get_variant_types(buf->lines[j]);
+ if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type==VCF_REF )
{
- if ( maux->d[i][j].skip ) continue;
- if ( args->collapse&COLLAPSE_ANY ) break;
- int line_type = bcf_get_variant_types(reader->buffer[j]);
- if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- if ( line_type==VCF_REF )
- {
- if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
- else if ( var_type==VCF_REF )
- {
- if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
+ if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ref_mask ) break;
}
- }
- if ( j<=reader->nbuffer )
- {
- // found a suitable line for merging, place it at the beggining
- if ( j>0 )
+ else if ( maux->var_types&ref_mask )
{
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]);
- SWAP(maux1_t, maux->d[i][0], maux->d[i][j]);
+ if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
}
- // mark as finished so that it's ignored next time
- maux->d[i][0].skip |= SKIP_DONE;
- maux->has_line[i] = 1;
- nmask++;
}
}
- if ( !nmask ) break; // done, no more lines suitable for merging found
- merge_line(args); // merge and output the line
- maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished
+ if ( j<buf->end )
+ {
+ // found a suitable line for merging
+ buf->cur = j;
+
+ // mark as finished so that it's ignored next time
+ buf->rec[j].skip = SKIP_DONE;
+ nout++;
+ }
}
+ assert( nout );
+}
- // clean the alleles
- for (i=0; i<maux->nals; i++)
+void merge_line(args_t *args)
+{
+ if ( args->regs )
{
- free(maux->als[i]);
- maux->als[i] = 0;
+ if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
}
- maux->nals = 0;
- // get the buffers ready for the next next_line() call
- for (i=0; i<files->nreaders; i++)
- shake_buffer(maux, i, pos);
+ bcf1_t *out = args->out_line;
+ merge_chrom2qual(args, out);
+ merge_filter(args, out);
+ merge_info(args, out);
+ if ( args->do_gvcf )
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ merge_format(args, out);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
}
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd)
else
ksprintf(&str, " %s", argv[i]);
}
+ kputs("; Date=", &str);
+ time_t tm; time(&tm); kputs(ctime(&tm), &str);
kputc('\n', &str);
bcf_hdr_append(hdr,str.s);
free(str.s);
{
args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
if ( args->header_fname )
}
if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
- args->maux = maux_init(args->files);
+ args->maux = maux_init(args);
args->out_line = bcf_init1();
args->tmph = kh_init(strdict);
- int ret;
- while ( (ret=bcf_sr_next_line(args->files)) )
+
+ while ( bcf_sr_next_line(args->files) )
{
- merge_buffer(args);
+ // output cached gVCF blocks which end before the new record
+ if ( args->do_gvcf )
+ gvcf_flush(args,0);
+
+ maux_reset(args->maux);
+
+ // determine which of the new records are gvcf blocks
+ if ( args->do_gvcf )
+ gvcf_stage(args, args->maux->pos);
+
+ while ( can_merge(args) )
+ {
+ stage_line(args);
+ merge_line(args);
+ }
+ clean_buffer(args);
}
+ if ( args->do_gvcf )
+ gvcf_flush(args,1);
+
info_rules_destroy(args);
maux_destroy(args->maux);
bcf_hdr_destroy(args->out_hdr);
fprintf(stderr, " --force-samples resolve duplicate sample names\n");
fprintf(stderr, " --print-header print only the merged header and exit\n");
fprintf(stderr, " --use-header <file> use the provided header\n");
+ fprintf(stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n");
fprintf(stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(stderr, " -F, --filter-logic <x|+> remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
+ fprintf(stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
fprintf(stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(stderr, " -l, --file-list <file> read file names from the file\n");
fprintf(stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
{
{"help",no_argument,NULL,'h'},
{"merge",required_argument,NULL,'m'},
+ {"gvcf",required_argument,NULL,'g'},
{"file-list",required_argument,NULL,'l'},
+ {"missing-to-ref",no_argument,NULL,'0'},
{"apply-filters",required_argument,NULL,'f'},
{"use-header",required_argument,NULL,1},
{"print-header",no_argument,NULL,2},
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
+ {"filter-logic",required_argument,NULL,'F'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
switch (c) {
+ case 'F':
+ if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
+ else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
+ else error("Filter logic not recognised: %s\n", optarg);
+ break;
+ case '0': args->missing_to_ref = 1; break;
+ case 'g':
+ args->do_gvcf = 1;
+ if ( strcmp("-",optarg) )
+ {
+ args->gvcf_fai = fai_load(optarg);
+ if ( !args->gvcf_fai ) error("Failed to load the fai index: %s\n", optarg);
+ }
+ break;
case 'l': args->file_list = optarg; break;
case 'i': args->info_rules = optarg; break;
case 'o': args->output_fname = optarg; break;
if ( argc-optind<2 && !args->file_list ) usage();
args->files->require_index = 1;
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( regions_is_file )
+ args->regs = regidx_init(args->regions_list,NULL,NULL,sizeof(char*),NULL);
+ else
+ {
+ args->regs = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(args->regs,args->regions_list,',') !=0 ) error("Could not parse the regions: %s\n", args->regions_list);
+ regidx_insert(args->regs,NULL);
+ }
+ if ( !args->regs ) error("Could not parse the regions: %s\n", args->regions_list);
+ args->regs_itr = regitr_init(args->regs);
+ }
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
while (optind<argc)
{
if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
}
merge_vcf(args);
bcf_sr_destroy(args->files);
+ if ( args->regs ) regidx_destroy(args->regs);
+ if ( args->regs_itr ) regitr_destroy(args->regs_itr);
+ if ( args->gvcf_fai ) fai_destroy(args->gvcf_fai);
free(args);
return 0;
}
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
- Copyright (C) 2012-2014 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <stdio.h>
#include <string.h>
+#include <strings.h>
#include <errno.h>
#include <unistd.h>
#include <getopt.h>
#include <htslib/vcf.h>
#include <htslib/synced_bcf_reader.h>
#include <htslib/vcfutils.h>
+#include <htslib/faidx.h>
#include <math.h>
#include <ctype.h>
+#include <time.h>
#include "bcftools.h"
+#include "regidx.h"
#include "vcmp.h"
+#define DBG 0
+
#include <htslib/khash.h>
KHASH_MAP_INIT_STR(strdict, int)
typedef khash_t(strdict) strdict_t;
-#define SKIP_DONE 1
-#define SKIP_DIFF 2
+#define FLT_LOGIC_ADD 0
+#define FLT_LOGIC_REMOVE 1
+
+#define SKIP_DONE 1 // the record was processed
+#define SKIP_DIFF 2 // not compatible, merge later
#define IS_VL_G(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_G)
#define IS_VL_A(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_A)
#define IS_VL_R(hdr,id) (bcf_hdr_id2length(hdr,BCF_HL_FMT,id) == BCF_VL_R)
+#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+
// For merging INFO Number=A,G,R tags
typedef struct
{
void (*merger)(bcf_hdr_t *hdr, bcf1_t *line, struct _info_rule_t *rule);
int type; // one of BCF_HT_*
int block_size; // number of values in a block
+ int type_size; // size of the corresponding BCF_HT_* type
int nblocks; // number of blocks in nvals (the number of merged files)
int nvals, mvals; // used and total size of vals array
void *vals; // the info tag values
}
info_rule_t;
+typedef struct
+{
+ bcf1_t *line;
+ int end, active;
+}
+gvcf_aux_t;
+
// Auxiliary merge data for selecting the right combination
// of buffered records across multiple readers. maux1_t
// corresponds to one buffered line.
typedef struct
{
int skip;
- int *map; // mapping from input alleles to the output array
+ int *map; // mapping from input alleles to the array of output alleles (set by merge_alleles)
int mmap; // size of map array (only buffer[i].n_allele is actually used)
int als_differ;
}
maux1_t;
typedef struct
{
- int n; // number of readers
+ int rid; // current rid
+ int beg,end; // valid ranges in reader's buffer [beg,end). Maintained by maux_reset and gvcf_flush.
+ int cur; // current line or -1 if none
+ int npos; // number of unprocessed lines at this position
+ int mrec; // allocated size of buf
+ maux1_t *rec; // buffer to keep reader's lines
+ bcf1_t **lines; // source buffer: either gvcf or readers' buffer
+}
+buffer_t;
+typedef struct
+{
+ int n, pos, var_types; // number of readers, current position, currently available variant types
+ char *chr; // current chromosome
char **als, **out_als; // merged alleles (temp, may contain empty records) and merged alleles ready for output
int nals, mals, nout_als, mout_als; // size of the output array
int *cnt, ncnt; // number of records that refer to the alleles
- int *nbuf; // readers have buffers of varying lengths
int *smpl_ploidy, *smpl_nGsize; // ploidy and derived number of values in Number=G tags, updated for each line (todo: cache for missing cases)
- int *flt, mflt, minf;
- bcf_info_t *inf;// out_line's INFO fields
bcf_fmt_t **fmt_map; // i-th output FORMAT field corresponds in j-th reader to i*nreader+j, first row is reserved for GT
int nfmt_map; // number of rows in the fmt_map array
int *agr_map, nagr_map, magr_map; // mapping between Number=AGR element indexes
void *tmp_arr;
int ntmp_arr;
- maux1_t **d; // d[i][j] i-th reader, j-th buffer line
+ buffer_t *buf;
AGR_info_t *AGR_info;
int nAGR_info, mAGR_info;
bcf_srs_t *files;
- int *has_line; // which files are being merged
+ int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present
+ gvcf_aux_t *gvcf; // buffer of gVCF lines
}
maux_t;
{
vcmp_t *vcmp;
maux_t *maux;
- int header_only, collapse, output_type, force_samples, merge_by_id;
+ regidx_t *regs; // apply regions only after the blocks are expanded
+ regitr_t *regs_itr;
+ int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref;
char *header_fname, *output_fname, *regions_list, *info_rules, *file_list;
+ faidx_t *gvcf_fai;
info_rule_t *rules;
int nrules;
strdict_t *tmph;
}
args_t;
+static bcf1_t *maux_get_line(args_t *args, int i)
+{
+ maux_t *ma = args->maux;
+ int ibuf = ma->buf[i].cur;
+ if ( ibuf >= 0 ) return ma->buf[i].lines[ibuf];
+ return NULL;
+}
+
static void info_rules_merge_sum(bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule)
{
if ( !rule->nvals ) return;
if ( str.l ) kputc(',',&str);
kputs("DP4:sum",&str);
}
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "QS")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("QS:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "MinDP")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("MinDP:min",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "I16")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("I16:sum",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IDV")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IDV:max",&str);
+ }
+ if ( args->do_gvcf && bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, "IMF")) )
+ {
+ if ( str.l ) kputc(',',&str);
+ kputs("IMF:max",&str);
+ }
+
if ( !str.l ) return;
args->info_rules = str.s;
}
int id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, rule->hdr_tag);
if ( !bcf_hdr_idinfo_exists(args->out_hdr,BCF_HL_INFO,id) ) error("The tag is not defined in the header: \"%s\"\n", rule->hdr_tag);
rule->type = bcf_hdr_id2type(args->out_hdr,BCF_HL_INFO,id);
- if ( rule->type!=BCF_HT_INT && rule->type!=BCF_HT_REAL && rule->type!=BCF_HT_STR ) error("The type is not supported: \"%s\"\n", rule->hdr_tag);
+ if ( rule->type==BCF_HT_INT ) rule->type_size = sizeof(int32_t);
+ else if ( rule->type==BCF_HT_REAL ) rule->type_size = sizeof(float);
+ else if ( rule->type==BCF_HT_STR ) rule->type_size = sizeof(char);
+ else error("The type is not supported: \"%s\"\n", rule->hdr_tag);
- while ( *ss ) ss++; ss++;
+ ss = strchr(ss, '\0'); ss++;
if ( !*ss ) error("Could not parse INFO rules, missing logic of \"%s\"\n", rule->hdr_tag);
int is_join = 0;
error("Only fixed-length vectors are supported with -i %s:%s\n", ss, rule->hdr_tag);
}
- while ( *ss ) ss++; ss++; n++;
+ ss = strchr(ss, '\0'); ss++;
+ n++;
}
free(str.s);
free(tmp);
}
static int info_rules_add_values(args_t *args, bcf_hdr_t *hdr, bcf1_t *line, info_rule_t *rule, maux1_t *als, int var_len)
{
- int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &args->maux->ntmp_arr, rule->type);
+ int msize = args->maux->ntmp_arr / rule->type_size;
+ int ret = bcf_get_info_values(hdr, line, rule->hdr_tag, &args->maux->tmp_arr, &msize, rule->type);
if ( ret<=0 ) error("FIXME: error parsing %s at %s:%d .. %d\n", rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1,ret);
+ args->maux->ntmp_arr = msize * rule->type_size;
rule->nblocks++;
int i, j;
if ( var_len==BCF_VL_A )
{
- assert( ret==line->n_allele-1 );
+ if ( ret!=line->n_allele-1 ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
// create mapping from source file ALT indexes to dst file indexes
}
else if ( var_len==BCF_VL_R )
{
- assert( ret==line->n_allele );
+ if ( ret!=line->n_allele ) error("Wrong number of %s fields at %s:%d\n",rule->hdr_tag,bcf_seqname(hdr,line),line->pos+1);
args->maux->nagr_map = ret;
hts_expand(int,args->maux->nagr_map,args->maux->magr_map,args->maux->agr_map);
for (i=0; i<ret; i++) args->maux->agr_map[i] = als->map[i];
{
for (i=0; i<*nb; i++)
{
+ if ( b[i][0]=='<' ) continue; // symbolic allele, do not modify
+ if ( b[i][0]=='*' ) continue; // overlapping deletion (*), do not modify
int l = strlen(b[i]);
b[i] = (char*) realloc(b[i],l+rla-rlb+1);
memcpy(b[i]+l,a[0]+rlb,rla-rlb+1);
// now check if the $a alleles are present and if not add them
for (i=1; i<na; i++)
{
+ int const_ai = 1;
char *ai;
- if ( rlb>rla ) // $a alleles need expanding
+ if ( rlb>rla && a[i][0]!='<' && a[i][0]!='*' ) // $a alleles need expanding and not a symbolic allele or *
{
int l = strlen(a[i]);
ai = (char*) malloc(l+rlb-rla+1);
memcpy(ai,a[i],l);
memcpy(ai+l,b[0]+rla,rlb-rla+1);
+ const_ai = 0;
}
else
ai = a[i];
if ( j<*nb ) // $b already has the same allele
{
map[i] = j;
- if ( rlb>rla ) free(ai);
+ if ( !const_ai ) free(ai);
continue;
}
// new allele
map[i] = *nb;
- b[*nb] = rlb>rla ? ai : strdup(ai);
+ if ( b[*nb] ) free(b[*nb]);
+ b[*nb] = const_ai ? strdup(ai) : ai;
(*nb)++;
}
return b;
}
-maux_t *maux_init(bcf_srs_t *files)
+maux_t *maux_init(args_t *args)
{
+ bcf_srs_t *files = args->files;
maux_t *ma = (maux_t*) calloc(1,sizeof(maux_t));
ma->n = files->nreaders;
- ma->nbuf = (int *) calloc(ma->n,sizeof(int));
- ma->d = (maux1_t**) calloc(ma->n,sizeof(maux1_t*));
ma->files = files;
int i, n_smpl = 0;
for (i=0; i<ma->n; i++)
n_smpl += bcf_hdr_nsamples(files->readers[i].header);
+ if ( args->do_gvcf )
+ {
+ ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
+ for (i=0; i<ma->n; i++)
+ ma->gvcf[i].line = bcf_init1();
+ }
ma->smpl_ploidy = (int*) calloc(n_smpl,sizeof(int));
ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
- ma->has_line = (int*) malloc(ma->n*sizeof(int));
+ ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
+ for (i=0; i<ma->n; i++)
+ ma->buf[i].rid = -1;
return ma;
}
void maux_destroy(maux_t *ma)
{
- int i;
+ int i,j;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
for (i=0; i<ma->n; i++) // for each reader
{
- if ( !ma->d[i] ) continue;
- int j;
- for (j=0; j<ma->nbuf[i]; j++) // for each buffered line
- if ( ma->d[i][j].map ) free(ma->d[i][j].map);
- free(ma->d[i]);
+ for (j=0; j<ma->buf[i].mrec; j++) // for each buffered line
+ free(ma->buf[i].rec[j].map);
+ free(ma->buf[i].rec);
+ }
+ free(ma->buf);
+ if ( ma->gvcf )
+ {
+ for (i=0; i<ma->n; i++) bcf_destroy(ma->gvcf[i].line);
+ free(ma->gvcf);
}
for (i=0; i<ma->mAGR_info; i++)
free(ma->AGR_info[i].buf);
if (ma->ntmp_arr) free(ma->tmp_arr);
if (ma->nfmt_map) free(ma->fmt_map);
// ma->inf freed in bcf_destroy1
- free(ma->d);
- free(ma->nbuf);
for (i=0; i<ma->mals; i++) free(ma->als[i]);
if (ma->mout_als) free(ma->out_als);
free(ma->als);
free(ma->cnt);
free(ma->smpl_ploidy);
free(ma->smpl_nGsize);
- free(ma->has_line);
+ free(ma->chr);
free(ma);
}
-void maux_expand1(maux_t *ma, int i)
+void maux_expand1(buffer_t *buf, int size)
{
- if ( ma->nbuf[i] <= ma->files->readers[i].nbuffer )
+ if ( buf->mrec < size )
{
- int n = ma->files->readers[i].nbuffer + 1;
- ma->d[i] = (maux1_t*) realloc(ma->d[i], sizeof(maux1_t)*n);
- memset(ma->d[i]+ma->nbuf[i],0,sizeof(maux1_t)*(n-ma->nbuf[i]));
- ma->nbuf[i] = n;
+ hts_expand0(maux1_t,size,buf->mrec,buf->rec);
+ buf->mrec = size;
}
}
void maux_reset(maux_t *ma)
{
- int i;
- for (i=0; i<ma->n; i++) maux_expand1(ma, i);
- for (i=1; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ int i,j;
+ for (i=0; i<ma->n; i++) maux_expand1(&ma->buf[i],ma->files->readers[i].nbuffer+1);
+ for (i=0; i<ma->ncnt; i++) ma->cnt[i] = 0;
+ for (i=0; i<ma->mals; i++)
+ {
+ free(ma->als[i]);
+ ma->als[i] = NULL;
+ }
+ const char *chr = NULL;
+ ma->nals = 0;
+ ma->pos = -1;
+ for (i=0; i<ma->n; i++)
+ {
+ if ( !bcf_sr_has_line(ma->files,i) ) continue;
+ bcf1_t *line = bcf_sr_get_line(ma->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ chr = bcf_seqname(hdr,line);
+ ma->pos = line->pos;
+ break;
+ }
+ if ( chr )
+ {
+ free(ma->chr);
+ ma->chr = strdup(chr);
+ }
+ for (i=0; i<ma->n; i++)
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(ma->files,i);
+ ma->buf[i].rid = bcf_hdr_name2id(hdr,chr);
+ ma->buf[i].beg = bcf_sr_has_line(ma->files,i) ? 0 : 1;
+ for (j=ma->buf[i].beg; j<=ma->files->readers[i].nbuffer; j++)
+ {
+ ma->buf[i].rec[j].skip = 0;
+ bcf1_t *line = ma->files->readers[i].buffer[j];
+ if ( line->rid!=ma->buf[i].rid || line->pos!=ma->pos ) break;
+ }
+ ma->buf[i].end = j;
+ ma->buf[i].cur = -1;
+ if ( ma->buf[i].beg < ma->buf[i].end )
+ {
+ ma->buf[i].lines = ma->files->readers[i].buffer;
+ if ( ma->gvcf ) ma->gvcf[i].active = 0; // gvcf block cannot overlap with the next record
+ }
+ }
}
void maux_debug(maux_t *ma, int ir, int ib)
{
out->pos = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_unpack(line, BCF_UN_ALL);
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- // alleles
+ // not all maux alleles are always used, mark the ones we'll need
int j;
for (j=1; j<line->n_allele; j++)
- al_idxs[ ma->d[i][0].map[j] ] = 1;
+ {
+ int irec = ma->buf[i].cur;
+ al_idxs[ ma->buf[i].rec[irec].map[j] ] = 1;
+ }
// position
if ( out->pos==-1 )
}
// set QUAL to the max qual value. Not exactly correct, but good enough for now
- if ( !bcf_float_is_missing(files->readers[i].buffer[0]->qual) )
+ if ( !bcf_float_is_missing(line->qual) )
{
- if ( bcf_float_is_missing(out->qual) || out->qual < files->readers[i].buffer[0]->qual ) out->qual = files->readers[i].buffer[0]->qual;
+ if ( bcf_float_is_missing(out->qual) || out->qual < line->qual ) out->qual = line->qual;
}
}
// set ID
if ( !tmps->l ) kputs(".", tmps);
- if ( out->d.id ) free(out->d.id);
- out->d.id = strdup(tmps->s);
+ bcf_update_id(out_hdr, out, tmps->s);
// set alleles
ma->nout_als = 0;
int ir, j;
for (ir=0; ir<files->nreaders; ir++)
{
- if ( !ma->has_line[ir] ) continue;
- bcf1_t *line = files->readers[ir].buffer[0];
+ bcf1_t *line = maux_get_line(args,ir);
+ if ( !line ) continue;
for (j=1; j<line->n_allele; j++)
- if ( ma->d[ir][0].map[j]==i ) ma->d[ir][0].map[j] = ma->nout_als;
+ {
+ int irec = ma->buf[ir].cur;
+ if ( ma->buf[ir].rec[irec].map[j]==i ) ma->buf[ir].rec[irec].map[j] = ma->nout_als;
+ }
}
}
// Expand the arrays and realloc the alleles string. Note that all alleles are in a single allocated block.
bcf_hdr_t *out_hdr = args->out_hdr;
int i, ret;
+ if ( args->filter_logic == FLT_LOGIC_REMOVE )
+ {
+ for (i=0; i<files->nreaders; i++)
+ {
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
+ bcf_sr_t *reader = &files->readers[i];
+ bcf_hdr_t *hdr = reader->header;
+ if ( bcf_has_filter(hdr, line, "PASS") ) break;
+ }
+ if ( i<files->nreaders )
+ {
+ int flt_id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
+ bcf_add_filter(out_hdr, out, flt_id);
+ return;
+ }
+ }
+
khiter_t kitr;
strdict_t *tmph = args->tmph;
kh_clear(strdict, tmph);
- maux_t *ma = args->maux;
out->d.n_flt = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i]) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
- bcf_unpack(line, BCF_UN_ALL);
int k;
for (k=0; k<line->d.n_flt; k++)
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt);
if ( id==-1 ) error("Error: The filter is not defined in the header: %s\n", flt);
- hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt);
- ma->flt[out->d.n_flt] = id;
+ hts_expand(int,out->d.n_flt+1,out->d.m_flt,out->d.flt);
+ out->d.flt[out->d.n_flt] = id;
out->d.n_flt++;
kh_put(strdict, tmph, flt, &ret);
}
{
int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, "PASS");
for (i=0; i<out->d.n_flt; i++)
- if ( ma->flt[i]==id ) break;
+ if ( out->d.flt[i]==id ) break;
if ( i<out->d.n_flt )
{
out->d.n_flt--;
- for (; i<out->d.n_flt; i++) ma->flt[i] = ma->flt[i+1];
+ for (; i<out->d.n_flt; i++) out->d.flt[i] = out->d.flt[i+1];
}
}
- out->d.flt = ma->flt;
}
static void bcf_info_set_id(bcf1_t *line, bcf_info_t *info, int id, kstring_t *tmp_str)
{
- assert( !info->vptr_free );
-
uint8_t *ptr = info->vptr - info->vptr_off;
bcf_dec_typed_int1(ptr, &ptr);
kputsn_(info->vptr, info->len << bcf_type_shift[info->type], tmp_str);
info->vptr = (uint8_t*) tmp_str->s + info->vptr_off;
- info->vptr_free = 1;
- line->d.shared_dirty |= BCF1_DIRTY_INF;
tmp_str->s = NULL;
tmp_str->m = 0;
tmp_str->l = 0;
info_rules_reset(args);
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
+ int irec = ma->buf[i].cur;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_info; j++)
{
info_rule_t *rule = (info_rule_t*) bsearch(key, args->rules, args->nrules, sizeof(*args->rules), info_rules_comp_key);
if ( rule )
{
- maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->d[i][0] : NULL;
+ maux1_t *als = ( len==BCF_VL_A || len==BCF_VL_G || len==BCF_VL_R ) ? &ma->buf[i].rec[irec] : NULL;
if ( info_rules_add_values(args, hdr, line, rule, als, len) ) continue;
}
}
{
if ( kitr == kh_end(tmph) )
{
- // first occurance in this reader, alloc arrays
+ // seeing this key for the first time
ma->nAGR_info++;
hts_expand0(AGR_info_t,ma->nAGR_info,ma->mAGR_info,ma->AGR_info);
kitr = kh_put(strdict, tmph, key, &ret);
kitr = kh_get(strdict, tmph, key);
int idx = kh_val(tmph, kitr);
if ( idx<0 ) error("Error occurred while processing INFO tag \"%s\" at %s:%d\n", key,bcf_seqname(hdr,line),line->pos+1);
- merge_AGR_info_tag(hdr, line,inf,len,&ma->d[i][0],&ma->AGR_info[idx]);
+ merge_AGR_info_tag(hdr, line,inf,len,&ma->buf[i].rec[irec],&ma->AGR_info[idx]);
continue;
}
if ( kitr == kh_end(tmph) )
{
- hts_expand0(bcf_info_t,out->n_info+1,ma->minf,ma->inf);
- ma->inf[out->n_info].key = id;
- ma->inf[out->n_info].type = inf->type;
- ma->inf[out->n_info].len = inf->len;
- ma->inf[out->n_info].vptr = inf->vptr;
- ma->inf[out->n_info].v1.i = inf->v1.i;
- ma->inf[out->n_info].v1.f = inf->v1.f;
- ma->inf[out->n_info].vptr_off = inf->vptr_off;
- ma->inf[out->n_info].vptr_len = inf->vptr_len;
- ma->inf[out->n_info].vptr_free = inf->vptr_free;
+ // Seeing this key for the first time. Although quite hacky,
+ // this is faster than anything else given the data structures..
+
+ hts_expand0(bcf_info_t,out->n_info+1,out->d.m_info,out->d.info);
+ out->d.info[out->n_info].key = id;
+ out->d.info[out->n_info].type = inf->type;
+ out->d.info[out->n_info].len = inf->len;
+ out->d.info[out->n_info].v1.i = inf->v1.i;
+ out->d.info[out->n_info].v1.f = inf->v1.f;
+ out->d.info[out->n_info].vptr_off = inf->vptr_off;
+ out->d.info[out->n_info].vptr_len = inf->vptr_len;
+ out->d.info[out->n_info].vptr_free = 1;
+ out->d.info[out->n_info].vptr = (uint8_t*) malloc(inf->vptr_len+inf->vptr_off);
+ memcpy(out->d.info[out->n_info].vptr,inf->vptr-inf->vptr_off, inf->vptr_len+inf->vptr_off);
+ out->d.info[out->n_info].vptr += inf->vptr_off;
if ( (args->output_type & FT_BCF) && id!=bcf_hdr_id2int(hdr, BCF_DT_ID, key) )
- {
- // The existing packed info cannot be reused. Change the id.
- // Although quite hacky, it's faster than anything else given
- // the data structures
- bcf_info_set_id(out, &ma->inf[out->n_info], id, &args->tmps);
- }
+ bcf_info_set_id(out, &out->d.info[out->n_info], id, &args->tmps);
+ out->d.shared_dirty |= BCF1_DIRTY_INF;
out->n_info++;
kitr = kh_put(strdict, tmph, key, &ret);
kh_val(tmph,kitr) = -(out->n_info-1); // arbitrary negative value
}
}
}
- out->d.info = ma->inf;
- out->d.m_info = ma->minf;
for (i=0; i<args->nrules; i++)
args->rules[i].merger(args->out_hdr, out, &args->rules[i]);
for (i=0; i<ma->nAGR_info; i++)
}
memset(ma->smpl_ploidy,0,nsamples*sizeof(int));
+ int default_gt = args->missing_to_ref ? bcf_gt_unphased(0) : bcf_gt_missing;
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
int32_t *tmp = (int32_t *) ma->tmp_arr + ismpl*nsize;
+ int irec = ma->buf[i].cur;
int j, k;
if ( !fmt_ori )
// missing values: assume maximum ploidy
for (j=0; j<bcf_hdr_nsamples(hdr); j++)
{
- for (k=0; k<nsize; k++) { tmp[k] = 0; ma->smpl_ploidy[ismpl+j]++; }
+ for (k=0; k<nsize; k++) { tmp[k] = default_gt; ma->smpl_ploidy[ismpl+j]++; }
tmp += nsize;
}
ismpl += bcf_hdr_nsamples(hdr);
#define BRANCH(type_t, vector_end) { \
type_t *p_ori = (type_t*) fmt_ori->p; \
- if ( !ma->d[i][0].als_differ ) \
+ if ( !ma->buf[i].rec[irec].als_differ ) \
{ \
/* the allele numbering is unchanged */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
else \
{ \
int al = (p_ori[k]>>1) - 1; \
- al = al<=0 ? al + 1 : ma->d[i][0].map[al] + 1; \
+ al = al<=0 ? al + 1 : ma->buf[i].rec[irec].map[al] + 1; \
tmp[k] = (al << 1) | ((p_ori[k])&1); \
} \
} \
int nsize = 0, length = BCF_VL_FIXED, type = -1;
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ if ( !maux_get_line(args,i) ) continue;
if ( !fmt_map[i] ) continue;
if ( !key ) key = files->readers[i].header->id[BCF_DT_ID][fmt_map[i]->id].key;
type = fmt_map[i]->type;
bcf_sr_t *reader = &files->readers[i];
bcf_hdr_t *hdr = reader->header;
bcf_fmt_t *fmt_ori = fmt_map[i];
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = ma->buf[i].cur;
if ( fmt_ori )
{
type = fmt_ori->type;
- int nals_ori = reader->buffer[0]->n_allele;
+ int nals_ori = line->n_allele;
if ( length==BCF_VL_G )
{
// if all fields are missing then n==1 is valid
ismpl += bcf_hdr_nsamples(hdr); \
continue; \
} \
- assert( ma->has_line[i] ); \
- bcf1_t *line = reader->buffer[0]; \
src_type_t *src = (src_type_t*) fmt_ori->p; \
- if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->d[i][0].als_differ) ) \
+ if ( (length!=BCF_VL_G && length!=BCF_VL_A && length!=BCF_VL_R) || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) ) \
{ \
/* alleles unchanged, copy over */ \
for (j=0; j<bcf_hdr_nsamples(hdr); j++) \
int iori, inew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + iori; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
int iori,jori, inew,jnew; \
for (iori=0; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori]; \
+ inew = ma->buf[i].rec[irec].map[iori]; \
for (jori=0; jori<=iori; jori++) \
{ \
- jnew = ma->d[i][0].map[jori]; \
+ jnew = ma->buf[i].rec[irec].map[jori]; \
int kori = iori*(iori+1)/2 + jori; \
int knew = inew>jnew ? inew*(inew+1)/2 + jnew : jnew*(jnew+1)/2 + inew; \
src = (src_type_t*) fmt_ori->p + j*fmt_ori->n + kori; \
int iori,inew; \
for (iori=ifrom; iori<line->n_allele; iori++) \
{ \
- inew = ma->d[i][0].map[iori] - ifrom; \
+ inew = ma->buf[i].rec[irec].map[iori] - ifrom; \
tgt = (tgt_type_t *) ma->tmp_arr + (ismpl+j)*nsize + inew; \
if ( src_is_vector_end ) break; \
if ( src_is_missing ) tgt_set_missing; \
int i, j, ret, has_GT = 0, max_ifmt = 0; // max fmt index
for (i=0; i<files->nreaders; i++)
{
- if ( !ma->has_line[i] ) continue;
+ bcf1_t *line = maux_get_line(args,i);
+ if ( !line ) continue;
bcf_sr_t *reader = &files->readers[i];
- bcf1_t *line = reader->buffer[0];
bcf_hdr_t *hdr = reader->header;
for (j=0; j<line->n_fmt; j++)
{
ma->fmt_map[ifmt*files->nreaders+i] = fmt;
}
// Check if the allele numbering must be changed
- for (j=1; j<reader->buffer[0]->n_allele; j++)
- if ( ma->d[i][0].map[j]!=j ) break;
- ma->d[i][0].als_differ = j==reader->buffer[0]->n_allele ? 0 : 1;
+ int irec = ma->buf[i].cur;
+ for (j=1; j<line->n_allele; j++)
+ if ( ma->buf[i].rec[irec].map[j]!=j ) break;
+ ma->buf[i].rec[irec].als_differ = j==line->n_allele ? 0 : 1;
}
out->n_sample = bcf_hdr_nsamples(out_hdr);
merge_GT(args, ma->fmt_map, out);
update_AN_AC(out_hdr, out);
- if ( out->d.info!=ma->inf )
- {
- // hacky, we rely on htslib internals: bcf_update_info() reallocated the info
- ma->inf = out->d.info;
- ma->minf = out->d.m_info;
- }
-
for (i=1; i<=max_ifmt; i++)
merge_format_field(args, &ma->fmt_map[i*files->nreaders], out);
out->d.indiv_dirty = 1;
}
-// The core merging function, one or none line from each reader
-void merge_line(args_t *args)
+void gvcf_set_alleles(args_t *args)
+{
+ int i,k;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ maux->nals = 0;
+
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ bcf1_t *line = maux_get_line(args, i);
+ int irec = maux->buf[i].cur;
+
+ hts_expand(int, line->n_allele, maux->buf[i].rec[irec].mmap, maux->buf[i].rec[irec].map);
+ if ( !maux->nals ) // first record, copy the alleles to the output
+ {
+ maux->nals = line->n_allele;
+ hts_expand0(char*, maux->nals, maux->mals, maux->als);
+ hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
+ for (k=0; k<maux->nals; k++)
+ {
+ if ( maux->als[k] ) free(maux->als[k]);
+ maux->als[k] = strdup(line->d.allele[k]);
+ maux->buf[i].rec[irec].map[k] = k;
+ }
+ }
+ else
+ {
+ maux->als = merge_alleles(line->d.allele, line->n_allele, maux->buf[i].rec[irec].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als )
+ {
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,i);
+ error("Failed to merge alleles at %s:%d\n",bcf_seqname(hdr,line),line->pos+1);
+ }
+ }
+ }
+}
+
+/*
+ Output staged gVCF blocks, end is the last position of the block. Assuming
+ gaux[i].active flags are set and maux_get_line returns correct lines.
+*/
+void gvcf_write_block(args_t *args, int start, int end)
{
+ int i;
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ assert(gaux);
+
+ // Update POS
+ int min = INT_MAX;
+ char ref = 'N';
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( ref=='N' && gaux[i].line->pos==start ) ref = gaux[i].line->d.allele[0][0];
+ gaux[i].line->pos = start;
+ }
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < start )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ gaux[i].line->d.allele[0][0] = ref;
+ if ( min > gaux[i].end ) min = gaux[i].end;
+ }
+ // Check for valid gVCF blocks in this region
+ if ( min==INT_MAX )
+ {
+ assert(0);
+ maux->gvcf_min = 0;
+ return;
+ }
+
bcf1_t *out = args->out_line;
- bcf_clear1(out);
- out->unpacked = BCF_UN_ALL;
+ gvcf_set_alleles(args);
+
+ // Merge the staged lines
merge_chrom2qual(args, out);
merge_filter(args, out);
merge_info(args, out);
merge_format(args, out);
- bcf_write1(args->out_fh, args->out_hdr, out);
-}
+ if ( args->gvcf_fai && out->d.allele[0][0]=='N' )
+ {
+ int slen = 0;
+ char *seq = faidx_fetch_seq(args->gvcf_fai,maux->chr,out->pos,out->pos,&slen);
+ if (slen)
+ {
+ out->d.allele[0][0] = seq[0];
+ free(seq);
+ }
+ }
+ // Update END boundary
+ if ( end > start )
+ {
+ end++;
+ bcf_update_info_int32(args->out_hdr, out, "END", &end, 1);
+ }
+ else
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
-void debug_buffers(FILE *fp, bcf_srs_t *files);
-void debug_buffer(FILE *fp, bcf_sr_t *reader);
-#define SWAP(type_t,a,b) { type_t tmp = (a); (a) = (b); (b) = tmp; }
+ // Inactivate blocks which do not extend beyond END and find new gvcf_min
+ min = INT_MAX;
+ for (i=0; i<args->files->nreaders; i++)
+ {
+ if ( !gaux[i].active ) continue;
+ if ( gaux[i].end < end )
+ {
+ gaux[i].active = 0;
+ maux->buf[i].cur = -1;
+ continue;
+ }
+ // next min END position bigger than the current one
+ if ( maux->gvcf_min < gaux[i].end+1 && min > gaux[i].end+1 ) min = gaux[i].end + 1;
+ }
+ maux->gvcf_min = min==INT_MAX ? 0 : min;
+}
-// Clean the reader's buffer to and make it ready for the next next_line() call.
-// Moves finished records (SKIP_DONE flag set) at the end of the buffer and put
-// the rest to the beggining. Then shorten the buffer so that the last element
-// points to the last unfinished record. There are two special cases: the last
-// line of the buffer typically has a different position and must stay at the
-// end; next, the first record of the buffer must be one of those already
-// printed, as it will be discarded by next_line().
-//
-void shake_buffer(maux_t *maux, int ir, int pos)
+/*
+ Flush staged gVCF blocks. Flush everything if there are no more lines
+ (done=1) or if there is a new chromosome. If still on the same chromosome,
+ all hanging blocks must be ended by creating new records:
+ A
+ 1 END=10
+ B
+ 3 END=7
+ C
+ 3 END=5
+ out
+ 1 END=2 A . .
+ 3 END=5 A B C
+ 6 END=7 A B .
+ 8 END=10 A . .
+
+*/
+void gvcf_flush(args_t *args, int done)
{
- bcf_sr_t *reader = &maux->files->readers[ir];
- maux1_t *m = maux->d[ir];
-
- if ( !reader->buffer ) return;
-
int i;
- // FILE *fp = pysam_stdout;
- // fprintf(fp,"<going to shake> nbuf=%d\t", reader->nbuffer); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]); fprintf(fp,"\n");
- // debug_buffer(fp,reader);
- // fprintf(fp,"--\n");
+ maux_t *maux = args->maux;
- int a = 1, b = reader->nbuffer;
- if ( reader->buffer[b]->pos != pos ) b--; // move the last line separately afterwards
+ if ( !maux->chr ) return; // first time here, nothing to flush
- while ( a<b )
+ int flush_until = INT_MAX;
+ if ( !done )
{
- if ( !(m[a].skip&SKIP_DONE) ) { a++; continue; }
- if ( m[b].skip&SKIP_DONE ) { b--; continue; }
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[b]);
- SWAP(maux1_t, m[a], m[b]);
- a++;
- b--;
- }
+ // Get current position and chromosome
+ for (i=0; i<maux->n; i++)
+ if ( bcf_sr_has_line(maux->files,i) ) break;
+ bcf1_t *line = bcf_sr_get_line(maux->files,i);
+ bcf_hdr_t *hdr = bcf_sr_get_header(maux->files,i);
- // position $a to the after the first unfinished record
- while ( a<=reader->nbuffer && !(m[a].skip&SKIP_DONE) ) a++;
+ if ( !strcmp(maux->chr,bcf_seqname(hdr,line)) ) flush_until = line->pos; // still on the same chr
+ }
- if ( a<reader->nbuffer )
+ // When called on a region, trim the blocks accordingly
+ int start = maux->gvcf_break>=0 ? maux->gvcf_break + 1 : maux->pos;
+ if ( args->regs )
{
- // there is a gap between the unfinished lines at the beggining and the
- // last line. The last line must be brought forward to fill the gap
- if ( reader->buffer[reader->nbuffer]->pos != pos )
+ int rstart = -1, rend = -1;
+ if ( regidx_overlap(args->regs,maux->chr,start,flush_until,args->regs_itr) )
{
- SWAP(bcf1_t*, reader->buffer[a], reader->buffer[reader->nbuffer]);
- SWAP(maux1_t, m[a], m[reader->nbuffer]);
- reader->nbuffer = a;
+ // In case there are multiple regions, we treat them as one
+ rstart = args->regs_itr->beg;
+ while ( regitr_overlap(args->regs_itr) ) rend = args->regs_itr->end;
}
+ if ( rstart > start ) start = rstart;
+ if ( rend < flush_until ) flush_until = rend+1;
}
- if ( !(m[0].skip&SKIP_DONE) && reader->buffer[0]->pos==pos )
+ // output all finished blocks
+ while ( maux->gvcf_min && start < flush_until )
{
- // the first record is unfinished, replace it with an empty line
- // from the end of the buffer or else next_line will remove it
- if ( reader->nbuffer + 1 >= maux->nbuf[ir] )
+ // does the block end before the new line or is it interrupted?
+ int tmp = maux->gvcf_min < flush_until ? maux->gvcf_min : flush_until;
+ if ( start > tmp-1 ) break;
+ gvcf_write_block(args,start,tmp-1); // gvcf_min is 1-based
+ start = tmp;
+ }
+}
+
+/*
+ Check incoming lines for new gVCF blocks, set pointer to the current source
+ buffer (gvcf or readers). In contrast to gvcf_flush, this function can be
+ called only after maux_reset as it relies on updated maux buffers.
+*/
+void gvcf_stage(args_t *args, int pos)
+{
+ maux_t *maux = args->maux;
+ gvcf_aux_t *gaux = maux->gvcf;
+ bcf_srs_t *files = args->files;
+ int32_t *end = (int32_t*) maux->tmp_arr;
+ int i, nend = maux->ntmp_arr / sizeof(int32_t);
+
+ maux->gvcf_break = -1;
+ maux->gvcf_min = INT_MAX;
+ for (i=0; i<files->nreaders; i++)
+ {
+ if ( gaux[i].active )
{
- reader->nbuffer++;
- maux_expand1(maux, ir);
- reader->nbuffer--;
- m = maux->d[ir];
+ // gvcf block should not overlap with another record
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+ continue;
}
- if ( reader->nbuffer+1 >= reader->mbuffer )
- error("Uh, did not expect this: %d vs %d\n", reader->nbuffer,reader->mbuffer);
- if ( reader->buffer[reader->nbuffer]->pos!=pos )
+ // Does any of the lines have END set? It is enough to check only the
+ // first line, there should be no duplicate records with END in gVCF
+
+ if ( maux->buf[i].beg==maux->buf[i].end ) continue; // no new record
+
+ int irec = maux->buf[i].beg;
+ bcf_hdr_t *hdr = bcf_sr_get_header(files, i);
+ bcf1_t *line = args->files->readers[i].buffer[irec];
+ int ret = bcf_get_info_int32(hdr,line,"END",&end,&nend);
+ if ( ret==1 )
{
- // 4way swap
- bcf1_t *tmp = reader->buffer[0];
- reader->buffer[0] = reader->buffer[reader->nbuffer+1];
- reader->buffer[reader->nbuffer+1] = reader->buffer[reader->nbuffer];
- reader->buffer[reader->nbuffer] = tmp;
- m[reader->nbuffer].skip = m[0].skip;
- m[reader->nbuffer+1].skip = SKIP_DIFF;
- reader->nbuffer++;
+ // END is set, this is a new gVCF block. Cache this line in gaux[i] and swap with
+ // an empty record: the gaux line must be kept until we reach its END.
+ gaux[i].active = 1;
+ gaux[i].end = end[0] - 1;
+ SWAP(bcf1_t*,args->files->readers[i].buffer[irec],gaux[i].line);
+ gaux[i].line->pos = pos;
+
+ maux->buf[i].lines = &gaux[i].line;
+ maux->buf[i].beg = 0;
+ maux->buf[i].end = 1;
+ maux->buf[i].cur = 0;
+
+ // Set the rid,pos of the swapped line in the buffer or else the
+ // synced reader will have a problem with the next line
+ //
+ args->files->readers[i].buffer[irec]->rid = maux->buf[i].rid;
+ args->files->readers[i].buffer[irec]->pos = maux->pos;
+
+ // Update block offsets
+ if ( maux->gvcf_min > gaux[i].end+1 ) maux->gvcf_min = gaux[i].end + 1;
}
else
- {
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[reader->nbuffer+1]);
- SWAP(maux1_t, m[0], m[reader->nbuffer+1]);
- }
+ maux->gvcf_break = line->pos; // must break the gvcf block
}
+ maux->ntmp_arr = nend * sizeof(int32_t);
+ maux->tmp_arr = end;
+ if ( maux->gvcf_min==INT_MAX ) maux->gvcf_min = 0;
+}
+
+
+void debug_buffers(FILE *fp, bcf_srs_t *files);
+void debug_buffer(FILE *fp, bcf_srs_t *files, int reader);
+
+/*
+ Flush all buffered and processed records with the same coordinate.
+ Note that synced reader discards buffer[0], so that needs to stay
+ untouched.
+*/
+void clean_buffer(args_t *args)
+{
+ maux_t *ma = args->maux;
+
+ int ir;
+ for (ir=0; ir<ma->n; ir++)
+ {
+ // Invalidate pointer to reader's buffer or else gvcf_flush will attempt
+ // to use the old lines via maux_get_line()
+ if ( ma->gvcf && !ma->gvcf[ir].active ) ma->buf[ir].cur = -1;
- // debug_buffer(fp,reader);
- // fprintf(fp,"<shaken>\t"); for (i=0; i<reader->nbuffer; i++) fprintf(fp," %d", skip[i]);
- // fprintf(fp,"\n\n");
+ bcf_sr_t *reader = bcf_sr_get_reader(args->files,ir);
+ if ( !reader->nbuffer ) continue; // nothing to clean
- // set position of finished buffer[0] line to -1, otherwise swapping may
- // bring it back after next_line()
- reader->buffer[0]->pos = -1;
+ bcf1_t **buf = reader->buffer;
+ if ( buf[1]->rid!=ma->buf[ir].rid || buf[1]->pos!=ma->pos ) continue; // nothing to flush
- // trim the buffer, remove finished lines from the end
- i = reader->nbuffer;
- while ( i>=1 && m[i--].skip&SKIP_DONE )
- reader->nbuffer--;
+ int a = 1, b = 2;
+ while ( b<=reader->nbuffer && buf[b]->rid==ma->buf[ir].rid && buf[b]->pos==ma->pos ) b++;
+ // b now points to the first line we want to preserve
+ while ( b<=reader->nbuffer )
+ {
+ SWAP(bcf1_t*, buf[a], buf[b]);
+ a++; b++;
+ }
+ reader->nbuffer -= b-a;
+ }
}
-void debug_maux(args_t *args, int pos, int var_type)
+void debug_maux(args_t *args)
{
bcf_srs_t *files = args->files;
maux_t *maux = args->maux;
int j,k,l;
- fprintf(pysam_stderr,"Alleles to merge at %d\n", pos+1);
+ fprintf(pysam_stderr,"Alleles to merge at %d, nals=%d\n", maux->pos+1,maux->nals);
for (j=0; j<files->nreaders; j++)
{
bcf_sr_t *reader = &files->readers[j];
+ buffer_t *buf = &maux->buf[j];
fprintf(pysam_stderr," reader %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
+ for (k=buf->beg; k<buf->end; k++)
{
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
+ if ( buf->rec[k].skip & SKIP_DONE ) continue;
bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
fprintf(pysam_stderr,"\t");
- if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round
+ if ( buf->rec[k].skip ) fprintf(pysam_stderr,"["); // this record will not be merged in this round
for (l=0; l<line->n_allele; l++)
fprintf(pysam_stderr,"%s%s", l==0?"":",", line->d.allele[l]);
- if ( maux->d[j][k].skip ) fprintf(pysam_stderr,"]");
+ if ( buf->rec[k].skip ) fprintf(pysam_stderr,"]");
}
fprintf(pysam_stderr,"\n");
}
fprintf(pysam_stderr," counts: ");
- for (j=0; j<maux->nals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]); fprintf(pysam_stderr,"\n");
- for (j=0; j<files->nreaders; j++)
- {
- bcf_sr_t *reader = &files->readers[j];
- fprintf(pysam_stderr," out %d: ", j);
- for (k=0; k<=reader->nbuffer; k++)
- {
- if ( maux->d[j][k].skip==SKIP_DONE ) continue;
- bcf1_t *line = reader->buffer[k];
- if ( line->pos!=pos ) continue;
- if ( maux->d[j][k].skip ) continue;
- fprintf(pysam_stderr,"\t");
- for (l=0; l<line->n_allele; l++)
- fprintf(pysam_stderr,"%s%s", l==0?"":",", maux->als[maux->d[j][k].map[l]]);
- }
- fprintf(pysam_stderr,"\n");
- }
- fprintf(pysam_stderr,"\n");
+ for (j=0; j<maux->nals; j++) fprintf(pysam_stderr,"%s %dx %s", j==0?"":",",maux->cnt[j], maux->als[j]);
+ fprintf(pysam_stderr,"\n\n");
}
-// Determine which line should be merged from which reader: go through all
-// readers and all buffered lines, expand REF,ALT and try to match lines with
-// the same ALTs. A step towards output independent on input ordering of the
-// lines.
-void merge_buffer(args_t *args)
+
+/*
+ Determine which line should be merged from which reader: go through all
+ readers and all buffered lines, expand REF,ALT and try to match lines with
+ the same ALTs.
+ */
+int can_merge(args_t *args)
{
bcf_srs_t *files = args->files;
- int i, pos = -1, var_type = 0;
- char *id = NULL;
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
maux_t *maux = args->maux;
- maux_reset(maux);
+ gvcf_aux_t *gaux = maux->gvcf;
+ char *id = NULL, ref = 'N';
+ maux->var_types = maux->nals = 0;
- // set the current position
+ int i,j,k, ntodo = 0;
for (i=0; i<files->nreaders; i++)
{
- if ( bcf_sr_has_line(files,i) )
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = bcf_sr_get_line(files,i);
- pos = line->pos;
- var_type = bcf_get_variant_types(line);
- id = line->d.id;
- break;
+ // skip readers with active gvcf blocks
+ buf->rec[buf->beg].skip = SKIP_DIFF;
+ continue;
+ }
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ buf->rec[j].skip = SKIP_DIFF;
+ ntodo++;
+
+ if ( args->merge_by_id )
+ id = buf->lines[j]->d.id;
+ else
+ {
+ int var_type = bcf_get_variant_types(buf->lines[j]);
+ maux->var_types |= var_type ? var_type<<1 : 1;
+ }
}
+
+ // for gvcf: find out REF at this position
+ if ( buf->beg < buf->end && ref=='N' )
+ ref = buf->lines[buf->beg]->d.allele[0][0];
}
+ if ( !ntodo ) return 0;
// In this loop we select from each reader compatible candidate lines.
// (i.e. SNPs or indels). Go through all files and all lines at this
for (i=0; i<files->nreaders; i++)
{
bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
- int j, k;
- for (j=0; j<=reader->nbuffer; j++)
+ buffer_t *buf = &maux->buf[i];
+
+ if ( gaux && gaux[i].active )
{
- bcf1_t *line = reader->buffer[j];
+ gaux[i].line->d.allele[0][0] = ref;
+ gaux[i].line->pos = maux->pos;
+ }
+
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip & SKIP_DONE ) continue;
+
+ bcf1_t *line = buf->lines[j]; // ptr to reader's buffer or gvcf buffer
+
int line_type = bcf_get_variant_types(line);
+ line_type = line_type ? line_type<<1 : 1;
+
// select relevant lines
- maux->d[i][j].skip = SKIP_DIFF;
- if ( pos!=line->pos )
- {
- if ( j==0 ) maux->d[i][j].skip |= SKIP_DONE; // left from previous run, force to ignore
- continue;
- }
if ( args->merge_by_id )
{
if ( strcmp(id,line->d.id) ) continue;
{
// All alleles of the tested record must be present in the
// selected maux record plus variant types must be the same
- if ( var_type!=line->d.var_type ) continue;
+ if ( (maux->var_types & line_type) != line_type ) continue;
if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible
for (k=1; k<line->n_allele; k++)
{
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break;
}
- if ( k==line->n_allele ) continue; // no matching allele
+ if ( !(line_type&ref_mask) && k==line->n_allele ) continue; // not a REF-only site and there is no matching allele
}
if ( !(args->collapse&COLLAPSE_ANY) )
{
- int compatible = 0;
- if ( line_type==var_type ) compatible = 1;
- else if ( line_type==VCF_REF ) compatible = 1; // REF can go with anything
- else if ( var_type&VCF_SNP && line_type&VCF_SNP ) compatible = 1;
- else if ( var_type&VCF_INDEL && line_type&VCF_INDEL ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_SNP && line_type&VCF_MNP ) compatible = 1;
- else if ( var_type&VCF_MNP && line_type&VCF_SNP ) compatible = 1;
- if ( !compatible ) continue;
+ // Merge:
+ // - SNPs+SNPs+MNPs+REF if -m both,snps
+ // - indels+indels+REF if -m both,indels, REF only if SNPs are not present
+ // - SNPs come first
+ if ( line_type & indel_mask )
+ {
+ if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
+ if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
+ }
}
}
- maux->d[i][j].skip = 0;
+ buf->rec[j].skip = 0;
- hts_expand(int, line->n_allele, maux->d[i][j].mmap, maux->d[i][j].map);
+ hts_expand(int, line->n_allele, buf->rec[j].mmap, buf->rec[j].map);
if ( !maux->nals ) // first record, copy the alleles to the output
{
maux->nals = line->n_allele;
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=0; k<maux->nals; k++)
{
+ free(maux->als[k]);
maux->als[k] = strdup(line->d.allele[k]);
- maux->d[i][j].map[k] = k;
+ buf->rec[j].map[k] = k;
maux->cnt[k] = 1;
}
- pos = line->pos;
continue;
}
-
// normalize alleles
- maux->als = merge_alleles(line->d.allele, line->n_allele, maux->d[i][j].map, maux->als, &maux->nals, &maux->mals);
- if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(bcf_sr_get_header(args->files,j),line),line->pos+1,reader->fname);
+ maux->als = merge_alleles(line->d.allele, line->n_allele, buf->rec[j].map, maux->als, &maux->nals, &maux->mals);
+ if ( !maux->als ) error("Failed to merge alleles at %s:%d in %s\n",bcf_seqname(args->out_hdr,line),line->pos+1,reader->fname);
hts_expand0(int, maux->nals, maux->ncnt, maux->cnt);
for (k=1; k<line->n_allele; k++)
- maux->cnt[ maux->d[i][j].map[k] ]++; // how many times an allele appears in the files
+ maux->cnt[ buf->rec[j].map[k] ]++; // how many times an allele appears in the files
maux->cnt[0]++;
}
}
+ return 1;
+}
- // debug_maux(args, pos, var_type);
+/*
+ Select records that have the same alleles; the input ordering of indels
+ must not matter. Multiple VCF lines can be emitted from this loop.
+ We expect only very few alleles and not many records with the same
+ position in the buffers, therefore the nested loops should not slow us
+ much.
+*/
+void stage_line(args_t *args)
+{
+ int snp_mask = (VCF_SNP<<1)|(VCF_MNP<<1), indel_mask = VCF_INDEL<<1, ref_mask = 1;
+ bcf_srs_t *files = args->files;
+ maux_t *maux = args->maux;
- // Select records that have the same alleles; the input ordering of indels
- // must not matter. Multiple VCF lines can be emitted from this loop.
- // We expect only very few alleles and not many records with the same
- // position in the buffers, therefore the nested loops should not slow us
- // much.
- while (1)
+ // debug_maux(args);
+
+ // take the most frequent allele present in multiple files, REF is skipped
+ int i,j,k,icnt = 1;
+ for (i=2; i<maux->nals; i++)
+ if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
+
+ int nout = 0;
+ for (i=0; i<files->nreaders; i++)
{
- // take the most frequent allele present in multiple files
- int icnt = 0;
- for (i=1; i<maux->nals; i++)
- if ( maux->cnt[i] > maux->cnt[icnt] ) icnt = i;
- if ( maux->cnt[icnt]<0 ) break;
+ buffer_t *buf = &maux->buf[i];
+ buf->cur = -1;
+ if ( buf->beg >= buf->end ) continue; // no lines in the buffer
- int nmask = 0;
- for (i=0; i<files->nreaders; i++)
+ // find lines with the same allele
+ for (j=buf->beg; j<buf->end; j++)
{
- maux->has_line[i] = 0;
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->merge_by_id ) break;
+ if ( maux->nals==1 && buf->lines[j]->n_allele==1 ) break; // REF-only record
- bcf_sr_t *reader = &files->readers[i];
- if ( !reader->buffer ) continue;
+ for (k=0; k<buf->lines[j]->n_allele; k++)
+ if ( icnt==buf->rec[j].map[k] ) break;
- // find lines with the same allele
- int j;
- for (j=0; j<=reader->nbuffer; j++)
- {
- if ( maux->d[i][j].skip ) continue;
- int k;
- for (k=0; k<reader->buffer[j]->n_allele; k++)
- if ( icnt==maux->d[i][j].map[k] ) break;
- if ( k<reader->buffer[j]->n_allele ) break;
- }
- if ( j>reader->nbuffer )
- {
- // no matching allele found in this file
- if ( args->collapse==COLLAPSE_NONE ) continue;
+ if ( k<buf->lines[j]->n_allele ) break;
+ }
+ if ( j>=buf->end )
+ {
+ // no matching allele found in this file
+ if ( args->collapse==COLLAPSE_NONE ) continue;
- for (j=0; j<=reader->nbuffer; j++)
+ for (j=buf->beg; j<buf->end; j++)
+ {
+ if ( buf->rec[j].skip ) continue; // done or not compatible
+ if ( args->collapse&COLLAPSE_ANY ) break; // anything can be merged
+ int line_type = bcf_get_variant_types(buf->lines[j]);
+ if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( line_type==VCF_REF )
{
- if ( maux->d[i][j].skip ) continue;
- if ( args->collapse&COLLAPSE_ANY ) break;
- int line_type = bcf_get_variant_types(reader->buffer[j]);
- if ( var_type&VCF_SNP && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- if ( line_type==VCF_REF )
- {
- if ( var_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( var_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
- else if ( var_type==VCF_REF )
- {
- if ( line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
- if ( line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
- }
+ if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
+ if ( maux->var_types&ref_mask ) break;
}
- }
- if ( j<=reader->nbuffer )
- {
- // found a suitable line for merging, place it at the beggining
- if ( j>0 )
+ else if ( maux->var_types&ref_mask )
{
- SWAP(bcf1_t*, reader->buffer[0], reader->buffer[j]);
- SWAP(maux1_t, maux->d[i][0], maux->d[i][j]);
+ if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
+ if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
}
- // mark as finished so that it's ignored next time
- maux->d[i][0].skip |= SKIP_DONE;
- maux->has_line[i] = 1;
- nmask++;
}
}
- if ( !nmask ) break; // done, no more lines suitable for merging found
- merge_line(args); // merge and output the line
- maux->cnt[icnt] = -1; // do not pick this allele again, mark it as finished
+ if ( j<buf->end )
+ {
+ // found a suitable line for merging
+ buf->cur = j;
+
+ // mark as finished so that it's ignored next time
+ buf->rec[j].skip = SKIP_DONE;
+ nout++;
+ }
}
+ assert( nout );
+}
- // clean the alleles
- for (i=0; i<maux->nals; i++)
+void merge_line(args_t *args)
+{
+ if ( args->regs )
{
- free(maux->als[i]);
- maux->als[i] = 0;
+ if ( !regidx_overlap(args->regs,args->maux->chr,args->maux->pos,args->maux->pos,NULL) ) return;
}
- maux->nals = 0;
- // get the buffers ready for the next next_line() call
- for (i=0; i<files->nreaders; i++)
- shake_buffer(maux, i, pos);
+ bcf1_t *out = args->out_line;
+ merge_chrom2qual(args, out);
+ merge_filter(args, out);
+ merge_info(args, out);
+ if ( args->do_gvcf )
+ bcf_update_info_int32(args->out_hdr, out, "END", NULL, 0);
+ merge_format(args, out);
+ bcf_write1(args->out_fh, args->out_hdr, out);
+ bcf_clear1(out);
}
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd)
else
ksprintf(&str, " %s", argv[i]);
}
+ kputs("; Date=", &str);
+ time_t tm; time(&tm); kputs(ctime(&tm), &str);
kputc('\n', &str);
bcf_hdr_append(hdr,str.s);
free(str.s);
{
args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);
+ if ( args->n_threads ) hts_set_opt(args->out_fh, HTS_OPT_THREAD_POOL, args->files->p); //hts_set_threads(args->out_fh, args->n_threads);
args->out_hdr = bcf_hdr_init("w");
if ( args->header_fname )
}
if ( args->collapse==COLLAPSE_NONE ) args->vcmp = vcmp_init();
- args->maux = maux_init(args->files);
+ args->maux = maux_init(args);
args->out_line = bcf_init1();
args->tmph = kh_init(strdict);
- int ret;
- while ( (ret=bcf_sr_next_line(args->files)) )
+
+ while ( bcf_sr_next_line(args->files) )
{
- merge_buffer(args);
+ // output cached gVCF blocks which end before the new record
+ if ( args->do_gvcf )
+ gvcf_flush(args,0);
+
+ maux_reset(args->maux);
+
+ // determine which of the new records are gvcf blocks
+ if ( args->do_gvcf )
+ gvcf_stage(args, args->maux->pos);
+
+ while ( can_merge(args) )
+ {
+ stage_line(args);
+ merge_line(args);
+ }
+ clean_buffer(args);
}
+ if ( args->do_gvcf )
+ gvcf_flush(args,1);
+
info_rules_destroy(args);
maux_destroy(args->maux);
bcf_hdr_destroy(args->out_hdr);
fprintf(pysam_stderr, " --force-samples resolve duplicate sample names\n");
fprintf(pysam_stderr, " --print-header print only the merged header and exit\n");
fprintf(pysam_stderr, " --use-header <file> use the provided header\n");
+ fprintf(pysam_stderr, " -0 --missing-to-ref assume genotypes at missing sites are 0/0\n");
fprintf(pysam_stderr, " -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. \"PASS,.\")\n");
+ fprintf(pysam_stderr, " -F, --filter-logic <x|+> remove filters if some input is PASS (\"x\"), or apply all filters (\"+\") [+]\n");
+ fprintf(pysam_stderr, " -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max\n");
fprintf(pysam_stderr, " -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
fprintf(pysam_stderr, " -l, --file-list <file> read file names from the file\n");
fprintf(pysam_stderr, " -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
{
{"help",no_argument,NULL,'h'},
{"merge",required_argument,NULL,'m'},
+ {"gvcf",required_argument,NULL,'g'},
{"file-list",required_argument,NULL,'l'},
+ {"missing-to-ref",no_argument,NULL,'0'},
{"apply-filters",required_argument,NULL,'f'},
{"use-header",required_argument,NULL,1},
{"print-header",no_argument,NULL,2},
{"regions-file",required_argument,NULL,'R'},
{"info-rules",required_argument,NULL,'i'},
{"no-version",no_argument,NULL,8},
+ {"filter-logic",required_argument,NULL,'F'},
{NULL,0,NULL,0}
};
- while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0",loptions,NULL)) >= 0) {
switch (c) {
+ case 'F':
+ if ( !strcmp(optarg,"+") ) args->filter_logic = FLT_LOGIC_ADD;
+ else if ( !strcmp(optarg,"x") ) args->filter_logic = FLT_LOGIC_REMOVE;
+ else error("Filter logic not recognised: %s\n", optarg);
+ break;
+ case '0': args->missing_to_ref = 1; break;
+ case 'g':
+ args->do_gvcf = 1;
+ if ( strcmp("-",optarg) )
+ {
+ args->gvcf_fai = fai_load(optarg);
+ if ( !args->gvcf_fai ) error("Failed to load the fai index: %s\n", optarg);
+ }
+ break;
case 'l': args->file_list = optarg; break;
case 'i': args->info_rules = optarg; break;
case 'o': args->output_fname = optarg; break;
if ( argc-optind<2 && !args->file_list ) usage();
args->files->require_index = 1;
- if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
- error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->regions_list )
+ {
+ if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
+ error("Failed to read the regions: %s\n", args->regions_list);
+ if ( regions_is_file )
+ args->regs = regidx_init(args->regions_list,NULL,NULL,sizeof(char*),NULL);
+ else
+ {
+ args->regs = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
+ if ( regidx_insert_list(args->regs,args->regions_list,',') !=0 ) error("Could not parse the regions: %s\n", args->regions_list);
+ regidx_insert(args->regs,NULL);
+ }
+ if ( !args->regs ) error("Could not parse the regions: %s\n", args->regions_list);
+ args->regs_itr = regitr_init(args->regs);
+ }
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
while (optind<argc)
{
if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
}
merge_vcf(args);
bcf_sr_destroy(args->files);
+ if ( args->regs ) regidx_destroy(args->regs);
+ if ( args->regs_itr ) regitr_destroy(args->regs_itr);
+ if ( args->gvcf_fai ) fai_destroy(args->gvcf_fai);
free(args);
return 0;
}
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
for (i=0; i<nseq; i++)
{
char c = toupper(seq[i]);
- if ( c!='A' && c!='C' && c!='G' && c!='T' ) { seq[i] = 'N'; n++; }
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) { seq[i] = 'N'; n++; }
}
return n;
}
+static inline int has_non_acgtn(char *seq, int nseq)
+{
+ char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number
+ while ( *seq && seq<end )
+ {
+ char c = toupper(*seq);
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 1;
+ seq++;
+ }
+ return 0;
+}
static void fix_ref(args_t *args, bcf1_t *line)
{
if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
}
-#define ERR_DUP_ALLELE -2
-#define ERR_REF_MISMATCH -1
-#define ERR_OK 0
-#define ERR_SYMBOLIC 1
+#define ERR_DUP_ALLELE -2
+#define ERR_REF_MISMATCH -1
+#define ERR_OK 0
+#define ERR_SYMBOLIC 1
+#define ERR_SPANNING_DELETION 2
static int realign(args_t *args, bcf1_t *line)
{
int i, nref, reflen = strlen(line->d.allele[0]);
char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
- replace_iupac_codes(ref,nref);
+ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
- // does REF contain non-standard bases?
- if ( replace_iupac_codes(line->d.allele[0],reflen) )
+ // does VCF REF contain non-standard bases?
+ if ( has_non_acgtn(line->d.allele[0],reflen) )
{
- args->nchanged++;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+ free(ref);
+ return ERR_REF_MISMATCH;
}
if ( strcasecmp(ref,line->d.allele[0]) )
{
for (i=0; i<line->n_allele; i++)
{
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+ if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
+ if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC; // breakend, not an error
+ if ( has_non_acgtn(line->d.allele[i],0) )
+ {
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]);
+ return ERR_REF_MISMATCH;
+ }
als[i].l = 0;
kputs(line->d.allele[i], &als[i]);
int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
if ( len==BCF_VL_A ) \
{ \
- assert( ret==src->n_allele-1); \
+ if ( ret!=src->n_allele-1 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \
bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( ret==src->n_allele); \
+ if ( ret!=src->n_allele ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \
if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
} \
else if ( len==BCF_VL_G ) \
{ \
- assert( ret==src->n_allele*(src->n_allele+1)/2 ); \
+ if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \
if ( ialt!=0 ) \
{ \
vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
} \
if ( len==BCF_VL_A ) \
{ \
- assert( nvals==(src->n_allele-1)*nsmpl); \
+ if ( nvals!=(src->n_allele-1)*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( nvals==src->n_allele*nsmpl); \
+ if ( nvals!=src->n_allele*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
if ( *se==',' ) nfields++;
se++;
}
- assert( nfields==src->n_allele*(src->n_allele+1)/2 || nfields==src->n_allele );
+ if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele )
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n",
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields);
+
int len = 0;
if ( nfields==src->n_allele ) // haploid
{
else
{
int ial = bcf_gt_allele(gt2[k]);
- assert( ial<args->maps[i].nals );
+ if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial);
gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
}
}
{
htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(out, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
bcf_hdr_write(out, args->hdr);
fprintf(stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
fprintf(stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
fprintf(stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
- fprintf(stderr, " -f, --fasta-ref <file> reference sequence\n");
+ fprintf(stderr, " -f, --fasta-ref <file> reference sequence (MANDATORY)\n");
fprintf(stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(stderr, " --no-version do not append version and command line to the header\n");
fprintf(stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
fprintf(stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(stderr, "\n");
exit(1);
error("Failed to read the targets: %s\n", args->targets);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
init_data(args);
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
for (i=0; i<nseq; i++)
{
char c = toupper(seq[i]);
- if ( c!='A' && c!='C' && c!='G' && c!='T' ) { seq[i] = 'N'; n++; }
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) { seq[i] = 'N'; n++; }
}
return n;
}
+static inline int has_non_acgtn(char *seq, int nseq)
+{
+ char *end = nseq ? seq + nseq : seq + UINT32_MAX; // arbitrary large number
+ while ( *seq && seq<end )
+ {
+ char c = toupper(*seq);
+ if ( c!='A' && c!='C' && c!='G' && c!='T' && c!='N' ) return 1;
+ seq++;
+ }
+ return 0;
+}
static void fix_ref(args_t *args, bcf1_t *line)
{
if ( changed ) bcf_update_genotypes(args->hdr,line,gts,ngts);
}
-#define ERR_DUP_ALLELE -2
-#define ERR_REF_MISMATCH -1
-#define ERR_OK 0
-#define ERR_SYMBOLIC 1
+#define ERR_DUP_ALLELE -2
+#define ERR_REF_MISMATCH -1
+#define ERR_OK 0
+#define ERR_SYMBOLIC 1
+#define ERR_SPANNING_DELETION 2
static int realign(args_t *args, bcf1_t *line)
{
int i, nref, reflen = strlen(line->d.allele[0]);
char *ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
if ( !ref ) error("faidx_fetch_seq failed at %s:%d\n", args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
- replace_iupac_codes(ref,nref);
+ replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
- // does REF contain non-standard bases?
- if ( replace_iupac_codes(line->d.allele[0],reflen) )
+ // does VCF REF contain non-standard bases?
+ if ( has_non_acgtn(line->d.allele[0],reflen) )
{
- args->nchanged++;
- bcf_update_alleles(args->hdr,line,(const char**)line->d.allele,line->n_allele);
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN reference allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[0]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(pysam_stderr,"NON_ACGTN_REF\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
+ free(ref);
+ return ERR_REF_MISMATCH;
}
if ( strcasecmp(ref,line->d.allele[0]) )
{
for (i=0; i<line->n_allele; i++)
{
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
+ if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
+ if ( bcf_get_variant_type(line,i)==VCF_BND ) return ERR_SYMBOLIC; // breakend, not an error
+ if ( has_non_acgtn(line->d.allele[i],0) )
+ {
+ if ( args->check_ref==CHECK_REF_EXIT )
+ error("Non-ACGTN alternate allele at %s:%d .. REF_SEQ:'%s' vs VCF:'%s'\n", bcf_seqname(args->hdr,line),line->pos+1,ref,line->d.allele[i]);
+ if ( args->check_ref & CHECK_REF_WARN )
+ fprintf(pysam_stderr,"NON_ACGTN_ALT\t%s\t%d\t%s\n", bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[i]);
+ return ERR_REF_MISMATCH;
+ }
als[i].l = 0;
kputs(line->d.allele[i], &als[i]);
int len = bcf_hdr_id2length(args->hdr,BCF_HL_INFO,info->key); \
if ( len==BCF_VL_A ) \
{ \
- assert( ret==src->n_allele-1); \
+ if ( ret!=src->n_allele-1 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele-1,ret); \
bcf_update_info_##type(args->hdr,dst,tag,vals+ialt,1); \
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( ret==src->n_allele); \
+ if ( ret!=src->n_allele ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele,ret); \
if ( ialt!=0 ) vals[1] = vals[ialt+1]; \
bcf_update_info_##type(args->hdr,dst,tag,vals,2); \
} \
else if ( len==BCF_VL_G ) \
{ \
- assert( ret==src->n_allele*(src->n_allele+1)/2 ); \
+ if ( ret!=src->n_allele*(src->n_allele+1)/2 ) \
+ error("Error: wrong number of fields in INFO/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,ret); \
if ( ialt!=0 ) \
{ \
vals[1] = vals[bcf_alleles2gt(0,ialt+1)]; \
} \
if ( len==BCF_VL_A ) \
{ \
- assert( nvals==(src->n_allele-1)*nsmpl); \
+ if ( nvals!=(src->n_allele-1)*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,(src->n_allele-1)*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
} \
else if ( len==BCF_VL_R ) \
{ \
- assert( nvals==src->n_allele*nsmpl); \
+ if ( nvals!=src->n_allele*nsmpl ) \
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d, found %d\n", \
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*nsmpl,nvals); \
nvals /= nsmpl; \
type_t *src_vals = vals, *dst_vals = vals; \
for (i=0; i<nsmpl; i++) \
if ( *se==',' ) nfields++;
se++;
}
- assert( nfields==src->n_allele*(src->n_allele+1)/2 || nfields==src->n_allele );
+ if ( nfields!=src->n_allele*(src->n_allele+1)/2 && nfields!=src->n_allele )
+ error("Error: wrong number of fields in FMT/%s at %s:%d, expected %d or %d, found %d\n",
+ tag,bcf_seqname(args->hdr,src),src->pos+1,src->n_allele*(src->n_allele+1)/2,src->n_allele,nfields);
+
int len = 0;
if ( nfields==src->n_allele ) // haploid
{
else
{
int ial = bcf_gt_allele(gt2[k]);
- assert( ial<args->maps[i].nals );
+ if ( ial>=args->maps[i].nals ) error("Error at %s:%d: incorrect allele index %d\n",bcf_seqname(args->hdr,lines[i]),lines[i]->pos+1,ial);
gt[k] = bcf_gt_unphased( args->maps[i].map[ial] ) | bcf_gt_is_phased(gt[k]);
}
}
{
htsFile *out = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
if ( out == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
- if ( args->n_threads ) hts_set_threads(out, args->n_threads);
+ if ( args->n_threads )
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, args->files->p);
if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_norm");
bcf_hdr_write(out, args->hdr);
fprintf(pysam_stderr, " -c, --check-ref <e|w|x|s> check REF alleles and exit (e), warn (w), exclude (x), or set (s) bad sites [e]\n");
fprintf(pysam_stderr, " -D, --remove-duplicates remove duplicate lines of the same type.\n");
fprintf(pysam_stderr, " -d, --rm-dup <type> remove duplicate snps|indels|both|any\n");
- fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence\n");
+ fprintf(pysam_stderr, " -f, --fasta-ref <file> reference sequence (MANDATORY)\n");
fprintf(pysam_stderr, " -m, --multiallelics <-|+>[type] split multiallelics (-) or join biallelics (+), type: snps|indels|both|any [both]\n");
fprintf(pysam_stderr, " --no-version do not append version and command line to the header\n");
fprintf(pysam_stderr, " -N, --do-not-normalize do not normalize indels (with -m or -c s)\n");
fprintf(pysam_stderr, " -s, --strict-filter when merging (-m+), merged site is PASS only if all sites being merged PASS\n");
fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
- fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(pysam_stderr, " -w, --site-win <int> buffer for sorting lines which changed position during realignment [1000]\n");
fprintf(pysam_stderr, "\n");
exit(1);
error("Failed to read the targets: %s\n", args->targets);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
if ( args->mrows_op&MROWS_SPLIT && args->rmdup ) error("Cannot combine -D and -m-\n");
init_data(args);
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
* Plugin API:
* ----------
* const char *about(void)
- * - short description used by 'bcftools plugin -l'
+ * - short description used by 'bcftools plugin -lv'
*
* const char *usage(void)
* - longer description used by 'bcftools +name -h'
args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
args->plugin_paths[args->nplugin_paths] = dir;
args->nplugin_paths++;
- if ( args->verbose ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
+ if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. ok\n", dir);
}
else
{
- if ( args->verbose ) fprintf(stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
+ if ( args->verbose > 1 ) fprintf(stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
free(dir);
}
{
tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
else fprintf(stderr,"%s:\n\tdlopen .. ok\n", tmp);
}
handle = dlopen(fname, RTLD_NOW);
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
else fprintf(stderr,"%s:\n\tdlopen .. ok\n", fname);
if ( ret )
plugin->init = NULL;
else
- if ( args->verbose ) fprintf(stderr,"\tinit .. ok\n");
+ if ( args->verbose > 1 ) fprintf(stderr,"\tinit .. ok\n");
plugin->run = (dl_run_f) dlsym(plugin->handle, "run");
ret = dlerror();
if ( ret )
plugin->run = NULL;
else
- if ( args->verbose ) fprintf(stderr,"\trun .. ok\n");
+ if ( args->verbose > 1 ) fprintf(stderr,"\trun .. ok\n");
if ( !plugin->init && !plugin->run )
{
if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name);
- else if ( args->verbose ) fprintf(stderr,"\tinit/run .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(stderr,"\tinit/run .. not found\n");
return -1;
}
if ( ret )
{
if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name);
- else if ( args->verbose ) fprintf(stderr,"\tversion .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(stderr,"\tversion .. not found\n");
return -1;
}
qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name);
for (i=0; i<nplugins; i++)
- printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about());
- printf("\n");
+ {
+ if ( args->verbose )
+ printf("\n-- %s --\n%s", plugins[i].name, plugins[i].about());
+ else
+ printf("%s\n", plugins[i].name);
+ }
+ if ( args->verbose ) printf("\n");
}
else
print_plugin_usage_hint();
fprintf(stderr, "Plugin options:\n");
fprintf(stderr, " -h, --help list plugin's options\n");
fprintf(stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(stderr, " -v, --verbose print debugging information on plugin failure\n");
+ fprintf(stderr, " -v, --verbose print verbose information, -vv increases verbosity\n");
fprintf(stderr, " -V, --version print version string and exit\n");
fprintf(stderr, "\n");
exit(1);
}
+static int is_verbose(int argc, char *argv[])
+{
+ int c, verbose = 0, opterr_ori = opterr;
+ static struct option loptions[] =
+ {
+ {"verbose",no_argument,NULL,'v'},
+ {NULL,0,NULL,0}
+ };
+ opterr = 0;
+ while ((c = getopt_long(argc, argv, "-v",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'v': verbose++; break;
+ case 1:
+ default: break;
+ }
+ }
+ opterr = opterr_ori;
+ optind = 0;
+ return verbose;
+}
int main_plugin(int argc, char *argv[])
{
int c;
char *plugin_name = NULL;
if ( argv[1][0]!='-' )
{
+ args->verbose = is_verbose(argc, argv);
plugin_name = argv[1];
argc--;
argv++;
{
switch (c) {
case 'V': version_only = 1; break;
- case 'v': args->verbose = 1; break;
+ case 'v': args->verbose++; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
/* vcfplugin.c -- plugin modules for operating on VCF/BCF files.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
* Plugin API:
* ----------
* const char *about(void)
- * - short description used by 'bcftools plugin -l'
+ * - short description used by 'bcftools plugin -lv'
*
* const char *usage(void)
* - longer description used by 'bcftools +name -h'
args->plugin_paths = (char**) realloc(args->plugin_paths,sizeof(char*)*(args->nplugin_paths+1));
args->plugin_paths[args->nplugin_paths] = dir;
args->nplugin_paths++;
- if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir);
+ if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. ok\n", dir);
}
else
{
- if ( args->verbose ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
+ if ( args->verbose > 1 ) fprintf(pysam_stderr, "plugin directory %s .. %s\n", dir, strerror(errno));
free(dir);
}
{
tmp = msprintf("%s/%s.so", args->plugin_paths[i],fname);
handle = dlopen(tmp, RTLD_NOW); // valgrind complains about unfreed memory, not our problem though
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", tmp,dlerror());
else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", tmp);
}
handle = dlopen(fname, RTLD_NOW);
- if ( args->verbose )
+ if ( args->verbose > 1 )
{
if ( !handle ) fprintf(pysam_stderr,"%s:\n\tdlopen .. %s\n", fname,dlerror());
else fprintf(pysam_stderr,"%s:\n\tdlopen .. ok\n", fname);
if ( ret )
plugin->init = NULL;
else
- if ( args->verbose ) fprintf(pysam_stderr,"\tinit .. ok\n");
+ if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit .. ok\n");
plugin->run = (dl_run_f) dlsym(plugin->handle, "run");
ret = dlerror();
if ( ret )
plugin->run = NULL;
else
- if ( args->verbose ) fprintf(pysam_stderr,"\trun .. ok\n");
+ if ( args->verbose > 1 ) fprintf(pysam_stderr,"\trun .. ok\n");
if ( !plugin->init && !plugin->run )
{
if ( exit_on_error ) error("Could not initialize %s, neither run or init found \n", plugin->name);
- else if ( args->verbose ) fprintf(pysam_stderr,"\tinit/run .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tinit/run .. not found\n");
return -1;
}
if ( ret )
{
if ( exit_on_error ) error("Could not initialize %s, version string not found\n", plugin->name);
- else if ( args->verbose ) fprintf(pysam_stderr,"\tversion .. not found\n");
+ else if ( args->verbose > 1 ) fprintf(pysam_stderr,"\tversion .. not found\n");
return -1;
}
qsort(plugins, nplugins, sizeof(plugins[0]), cmp_plugin_name);
for (i=0; i<nplugins; i++)
- fprintf(pysam_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about());
- fprintf(pysam_stdout, "\n");
+ {
+ if ( args->verbose )
+ fprintf(pysam_stdout, "\n-- %s --\n%s", plugins[i].name, plugins[i].about());
+ else
+ fprintf(pysam_stdout, "%s\n", plugins[i].name);
+ }
+ if ( args->verbose ) fprintf(pysam_stdout, "\n");
}
else
print_plugin_usage_hint();
fprintf(pysam_stderr, "Plugin options:\n");
fprintf(pysam_stderr, " -h, --help list plugin's options\n");
fprintf(pysam_stderr, " -l, --list-plugins list available plugins. See BCFTOOLS_PLUGINS environment variable and man page for details\n");
- fprintf(pysam_stderr, " -v, --verbose print debugging information on plugin failure\n");
+ fprintf(pysam_stderr, " -v, --verbose print verbose information, -vv increases verbosity\n");
fprintf(pysam_stderr, " -V, --version print version string and exit\n");
fprintf(pysam_stderr, "\n");
exit(1);
}
+static int is_verbose(int argc, char *argv[])
+{
+ int c, verbose = 0, opterr_ori = opterr;
+ static struct option loptions[] =
+ {
+ {"verbose",no_argument,NULL,'v'},
+ {NULL,0,NULL,0}
+ };
+ opterr = 0;
+ while ((c = getopt_long(argc, argv, "-v",loptions,NULL)) >= 0)
+ {
+ switch (c) {
+ case 'v': verbose++; break;
+ case 1:
+ default: break;
+ }
+ }
+ opterr = opterr_ori;
+ optind = 0;
+ return verbose;
+}
int main_plugin(int argc, char *argv[])
{
int c;
char *plugin_name = NULL;
if ( argv[1][0]!='-' )
{
+ args->verbose = is_verbose(argc, argv);
plugin_name = argv[1];
argc--;
argv++;
{
switch (c) {
case 'V': version_only = 1; break;
- case 'v': args->verbose = 1; break;
+ case 'v': args->verbose++; break;
case 'o': args->output_fname = optarg; break;
case 'O':
switch (optarg[0]) {
#include <htslib/synced_bcf_reader.h>
#include <htslib/kstring.h>
#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
#include "bcftools.h"
#include "HMM.h"
+#include "smpl_ilist.h"
#define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies
#define STATE_AZ 1 // autozygous state
+#define OUTPUT_ST (1<<1)
+#define OUTPUT_RG (1<<2)
+#define OUTPUT_GZ (1<<3)
+
/** Genetic map */
typedef struct
{
}
genmap_t;
+/** HMM data for each sample */
+typedef struct
+{
+ double *eprob; // emission probs [2*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+ int igenmap; // current position in genmap
+ int nused; // some stats to detect if things didn't go wrong
+ int nrid, *rid, *rid_off; // for viterbi training, keep all chromosomes
+ void *snapshot; // hmm snapshot
+ struct {
+ uint32_t beg,end,nqual;
+ double qual;
+ int rid, state;
+ } rg;
+}
+smpl_t;
+
typedef struct _args_t
{
bcf_srs_t *files;
double rec_rate; // constant recombination rate if > 0
hmm_t *hmm;
- double *eprob; // emission probs [2*nsites,msites]
- uint32_t *sites; // positions [nsites,msites]
- int nsites, msites;
+ double baum_welch_th;
int nrids, *rids, *rid_offs; // multiple chroms with vi_training
+ int nbuf_max, nbuf_olap;
- int32_t *itmp;
- int nitmp, mitmp;
float *AFs;
- int mAFs;
+ int32_t *itmp;
+ int mAFs, nitmp, mitmp, pl_hdr_id, gt_hdr_id;
double pl2p[256], *pdg;
int32_t skip_rid, prev_rid, prev_pos;
- int ntot, nused; // some stats to detect if things didn't go awfully wrong
- int ismpl, nsmpl; // index of query sample
- char *estimate_AF, *sample; // list of samples for AF estimate and query sample
- char **argv, *targets_list, *regions_list, *af_fname, *af_tag;
- int argc, fake_PLs, snps_only, vi_training;
+ int ntot; // some stats to detect if things didn't go wrong
+ smpl_t *smpl; // HMM data for each sample
+ smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF)
+ smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file)
+ char *estimate_AF; // list of samples for AF estimate and query sample
+ int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT
+ char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
+ int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+ BGZF *out;
+ kstring_t str;
}
args_t;
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
void *smalloc(size_t size)
{
static void init_data(args_t *args)
{
+ int i;
+
args->prev_rid = args->skip_rid = -1;
args->hdr = args->files->readers[0].header;
- if ( !args->sample )
- {
- if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
- args->sample = strdup(args->hdr->samples[0]);
- }
if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
- // Set samples
- kstring_t str = {0,0,0};
- if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
+ if ( !args->fake_PLs )
{
- int i, n;
- char **smpls = hts_readlist(args->estimate_AF, 1, &n);
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n");
+ if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT )
+ error("Error: The FORMAT/PL tag not defined as Integer in the header\n");
+ }
- // Make sure the query sample is included
- for (i=0; i<n; i++)
- if ( !strcmp(args->sample,smpls[i]) ) break;
+ if ( args->estimate_AF )
+ {
+ if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3;
+ else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; }
+ if ( strcmp("-",args->estimate_AF) )
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
- // Add the query sample if not present
- if ( i!=n ) kputs(args->sample, &str);
+ if ( args->estimate_AF || args->fake_PLs )
+ {
+ if ( args->af_from_PL )
+ {
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header\n");
+ }
+ else
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
+ }
+ if ( args->fake_PLs )
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
- for (i=0; i<n; i++)
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+ if ( args->samples )
+ {
+ // we may be able to subset to a few samples, for a text VCF this can be a major speedup
+ if ( (bcf_sr_get_reader(args->files,0))->file->format.format==vcf )
{
- if ( str.l ) kputc(',', &str);
- kputs(smpls[i], &str);
- free(smpls[i]);
+ kstring_t str = {0,0,0};
+ smpl_ilist_t *tmp = args->roh_smpl, *rmme = NULL;
+ if ( args->af_smpl )
+ {
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[args->roh_smpl->idx[i]], &str);
+ }
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ kputc(',', &str);
+ kputs(args->hdr->samples[args->af_smpl->idx[i]], &str);
+ }
+ rmme = tmp = smpl_ilist_init(args->hdr, str.s, 0, SMPL_NONE);
+ }
+ if ( tmp->n < bcf_hdr_nsamples(args->hdr) )
+ {
+ str.l = 0;
+ for (i=0; i<tmp->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[tmp->idx[i]], &str);
+ }
+ int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
+ else if ( ret>0 ) error("The %d-th sample not found in the VCF: %s\n", ret,str.s);
+
+ // update sample ids
+ smpl_ilist_destroy(args->roh_smpl);
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+
+ if ( args->af_smpl )
+ {
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
+ }
+ free(str.s);
+ if ( rmme )
+ smpl_ilist_destroy(rmme);
}
- free(smpls);
}
- else if ( !args->estimate_AF )
- kputs(args->sample, &str);
- if ( str.l )
+ // check whether all samples are in this list. If so, the lookup will not be needed
+ if ( args->af_smpl && args->af_smpl->n == bcf_hdr_nsamples(args->hdr) )
{
- int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
- if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
- else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
+ // all samples are in this list
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = NULL;
}
- if ( args->af_tag )
- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
- error("No such INFO tag in the VCF: %s\n", args->af_tag);
+ if ( args->buffer_size )
+ {
+ args->nbuf_olap = -1;
+ char *end;
+ double tmp = strtod(args->buffer_size,&end);
+ if ( *end )
+ {
+ if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size);
+ args->nbuf_olap = strtol(end+1,&end,10);
+ if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size);
+ }
+ if ( tmp<0 )
+ args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n;
+ else
+ args->nbuf_max = tmp;
- args->nsmpl = bcf_hdr_nsamples(args->hdr);
- args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
- free(str.s);
+ if ( args->nbuf_olap<0 )
+ args->nbuf_olap = args->nbuf_max*0.01;
+ }
+ fprintf(stderr,"Number of target samples: %d\n", args->roh_smpl->n);
+ fprintf(stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? bcf_hdr_nsamples(args->hdr) : 0));
+ fprintf(stderr,"Number of sites in the buffer/overlap: ");
+ if ( args->nbuf_max ) fprintf(stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap);
+ else fprintf(stderr,"unlimited\n");
+
+ args->smpl = (smpl_t*) calloc(args->roh_smpl->n,sizeof(smpl_t));
- int i;
for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);
// Init transition matrix and HMM
MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ args->hmm = hmm_init(2, tprob, 10000);
if ( args->genmap_fname )
- {
- args->hmm = hmm_init(2, tprob, 0);
hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
- }
else if ( args->rec_rate > 0 )
- {
- args->hmm = hmm_init(2, tprob, 0);
- hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
+ hmm_set_tprob_func(args->hmm, set_tprob_rrate, args);
- }
- else
- args->hmm = hmm_init(2, tprob, 10000);
+ args->out = bgzf_open(strcmp("stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu");
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));
// print header
- printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
- printf("# The command line was:\tbcftools %s", args->argv[0]);
+ args->str.l = 0;
+ ksprintf(&args->str, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ ksprintf(&args->str, "# The command line was:\tbcftools %s", args->argv[0]);
for (i=1; i<args->argc; i++)
- printf(" %s",args->argv[i]);
- printf("\n#\n");
- printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
+ ksprintf(&args->str, " %s",args->argv[i]);
+ ksprintf(&args->str, "\n#\n");
+ if ( args->output_type & OUTPUT_RG )
+ {
+ i = 2;
+ ksprintf(&args->str, "# RG");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Start", i++);
+ ksprintf(&args->str, "\t[%d]End", i++);
+ ksprintf(&args->str, "\t[%d]Length (bp)", i++);
+ ksprintf(&args->str, "\t[%d]Number of markers", i++);
+ ksprintf(&args->str, "\t[%d]Quality (average fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->output_type & OUTPUT_ST )
+ {
+ i = 2;
+ ksprintf(&args->str, "# ST");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Position", i++);
+ ksprintf(&args->str, "\t[%d]State (0:HW, 1:AZ)", i++);
+ ksprintf(&args->str, "\t[%d]Quality (fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->vi_training)
+ {
+ i = 2;
+ ksprintf(&args->str, "# VT, Viterbi Training");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Iteration", i++);
+ ksprintf(&args->str, "\t[%d]dAZ", i++);
+ ksprintf(&args->str, "\t[%d]dHW", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(HW|HW)", i++);
+ ksprintf(&args->str, "\t[%d]P(AZ|HW)", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(AZ|AZ)", i++);
+ ksprintf(&args->str, "\t[%d]P(HW|AZ)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l )
+ error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
static void destroy_data(args_t *args)
{
- free(args->sites);
- free(args->eprob);
- free(args->sample);
+ if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname);
+ int i;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ free(args->smpl[i].eprob);
+ free(args->smpl[i].sites);
+ free(args->smpl[i].rid);
+ free(args->smpl[i].rid_off);
+ free(args->smpl[i].snapshot);
+ }
+ free(args->str.s);
+ free(args->smpl);
+ if ( args->af_smpl ) smpl_ilist_destroy(args->af_smpl);
+ smpl_ilist_destroy(args->roh_smpl);
free(args->rids);
free(args->rid_offs);
hmm_destroy(args->hmm);
bcf_sr_destroy(args->files);
- free(args->itmp); free(args->AFs); free(args->pdg);
+ free(args->AFs); free(args->pdg);
free(args->genmap);
+ free(args->itmp);
+ free(args->samples);
}
static int load_genmap(args_t *args, bcf1_t *line)
hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
genmap_t *gm = &args->genmap[args->ngenmap-1];
+ // position, convert to 0-based
char *tmp, *end;
gm->pos = strtol(str.s, &tmp, 10);
if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->pos -= 1;
// skip second column
tmp++;
while ( *tmp && !isspace(*tmp) ) tmp++;
- // read the genetic map in cM
+ // read the genetic map in cM, scale from % to likelihood
gm->rate = strtod(tmp+1, &end);
if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->rate *= 0.01;
}
if ( !args->ngenmap ) error("Genetic map empty?\n");
- int i;
- for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1
if ( hts_close(fp) ) error("Close failed\n");
free(str.s);
return 0;
// position j to be equal or larger than end
int j = i;
while ( j+1<args->ngenmap && args->genmap[j].pos < end ) j++;
-
if ( i==j )
{
args->igenmap = i;
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
- double ci = get_genmap_rate(args, pos - prev_pos, pos);
+ double ci = get_genmap_rate(args, prev_pos, pos);
+ if ( args->rec_rate ) ci *= args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
}
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
double ci = (pos - prev_pos) * args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
*
*/
-static void flush_viterbi(args_t *args)
+static void flush_viterbi(args_t *args, int ismpl)
{
- int i,j;
+ smpl_t *smpl = &args->smpl[ismpl];
+ if ( !smpl->nsites ) return;
- if ( !args->nsites ) return;
+ const char *name = args->hdr->samples[ args->roh_smpl->idx[ismpl] ];
- if ( !args->vi_training )
+ int i,j,k;
+
+ if ( !args->vi_training ) // single viterbi pass
{
- // single viterbi pass, one chromsome
- hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
- hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
+ hmm_restore(args->hmm, smpl->snapshot);
+ int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites;
+ if ( end < smpl->nsites )
+ smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1);
+
+ args->igenmap = smpl->igenmap;
+ hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
+ hmm_run_fwd_bwd(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (i=0; i<args->nsites; i++)
+ const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+
+ for (i=0; i<end; i++)
{
int state = vpath[i*2]==STATE_AZ ? 1 : 0;
- double *pval = fwd + i*2;
- printf("%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
- }
- return;
- }
+ double qual = phred_score(1.0 - fwd[i*2 + state]);
+ if ( args->output_type & OUTPUT_ST )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "ST\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[i]+1, state, qual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ }
- // viterbi training, multiple chromosomes
- double t2az_prev, t2hw_prev;
- double deltaz, delthw;
- int niter = 0;
- do
- {
- double *tprob_arr = hmm_get_tprob(args->hmm);
- t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ;
- t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW;
- double tcounts[] = { 0,0,0,0 };
- for (i=0; i<args->nrids; i++)
- {
- // run viterbi for each chromosomes. eprob and sites contain
- // multiple chromosomes, rid_offs mark the boundaries
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
-
- // what transitions were observed: add to the total counts
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (j=1; j<nsites; j++)
+ if ( args->output_type & OUTPUT_RG )
{
- // count the number of transitions
- int prev_state = vpath[2*(j-1)];
- int curr_state = vpath[2*j];
- MAT(tcounts,2,curr_state,prev_state) += 1;
+ if ( state!=smpl->rg.state )
+ {
+ if ( !state ) // the region ends, flush
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
+ }
+ else
+ {
+ smpl->rg.state = 1;
+ smpl->rg.beg = smpl->sites[i];
+ smpl->rg.rid = args->prev_rid;
+ }
+ }
+ else if ( state )
+ {
+ smpl->rg.nqual++;
+ smpl->rg.qual += qual;
+ smpl->rg.end = smpl->sites[i];
+ }
}
}
- // update the transition matrix
- int n = 1;
- for (i=0; i<2; i++)
+ if ( end < smpl->nsites )
{
- for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
+ end = smpl->nsites - args->nbuf_olap;
+ memmove(smpl->sites, smpl->sites + end, sizeof(*smpl->sites)*args->nbuf_olap);
+ memmove(smpl->eprob, smpl->eprob + end*2, sizeof(*smpl->eprob)*args->nbuf_olap*2);
+ smpl->nsites = args->nbuf_olap;
+ smpl->igenmap = args->igenmap;
}
- for (i=0; i<2; i++)
+ else
{
- for (j=0; j<2; j++)
+ smpl->nsites = 0;
+ smpl->igenmap = 0;
+
+ if ( smpl->rg.state )
{
- // no transition to i-th state was observed, set to a small number
- if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n;
- else MAT(tcounts,2,i,j) /= n;
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
}
}
- // normalize
- for (i=0; i<2; i++)
+ return;
+ }
+
+
+ // viterbi training, multiple chromosomes
+ double t2az_prev, t2hw_prev;
+ double deltaz, delthw;
+
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
+ MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW;
+ MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ;
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ hmm_set_tprob(args->hmm, tprob_arr, 10000);
+
+ int niter = 0;
+ do
+ {
+ tprob_arr = hmm_get_tprob(args->hmm);
+ t2az_prev = MAT(tprob_arr,2,STATE_AZ,STATE_HW); //args->t2AZ;
+ t2hw_prev = MAT(tprob_arr,2,STATE_HW,STATE_AZ); //args->t2HW;
+ double tprob_new[] = { 0,0,0,0 };
+ for (i=0; i<smpl->nrid; i++)
{
- double norm = 0;
- for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i);
- assert( norm!=0 );
- for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm;
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ tprob_arr = hmm_run_baum_welch(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) += MAT(tprob_arr,2,j,k);
}
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) /= smpl->nrid;
- if ( args->genmap_fname || args->rec_rate > 0 )
- hmm_set_tprob(args->hmm, tcounts, 0);
- else
- hmm_set_tprob(args->hmm, tcounts, 10000);
+ hmm_set_tprob(args->hmm, tprob_new, 10000);
- tprob_arr = hmm_get_tprob(args->hmm);
- deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
- delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
+ deltaz = fabs(MAT(tprob_new,2,STATE_AZ,STATE_HW)-t2az_prev);
+ delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev);
niter++;
- fprintf(stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n",
- niter,deltaz,delthw,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ args->str.l = 0;
+ ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n",
+ name,niter,deltaz,delthw,
+ 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW),
+ 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
- while ( deltaz > 0.0 || delthw > 0.0 );
- double *tprob_arr = hmm_get_tprob(args->hmm);
- fprintf(stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th );
// output the results
- for (i=0; i<args->nrids; i++)
+ for (i=0; i<smpl->nrid; i++)
{
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
- hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ hmm_run_viterbi(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ hmm_run_fwd_bwd(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
+ const char *chr = bcf_hdr_id2name(args->hdr,smpl->rid[i]);
for (j=0; j<nsites; j++)
{
- int state = vpath[j*2];
- double pval = fwd[j*2 + state];
- printf("%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval);
+ int state = vpath[j*2]==STATE_AZ ? 1 : 0;
+ double *pval = fwd + j*2;
+ args->str.l = 0;
+ ksprintf(&args->str, "ROH\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[ioff+j]+1, state, phred_score(1.0-pval[state]));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
}
}
-static void push_rid(args_t *args, int rid)
-{
- args->nrids++;
- args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int));
- args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int));
- args->rids[ args->nrids-1 ] = rid;
- args->rid_offs[ args->nrids-1 ] = args->nsites;
-}
int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
{
return 0;
}
-int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
+int8_t *get_GT(args_t *args, bcf1_t *line)
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->gt_hdr_id ) break;
+ if ( i==line->n_fmt ) return NULL; // the tag is not present in this record
+
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ if ( fmt->n!=2 ) return NULL; // not diploid
+ if ( fmt->type!=BCF_BT_INT8 ) error("This is unexpected, GT type is %d\n", fmt->type);
+ return (int8_t*) fmt->p;
+}
+
+int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq)
+{
int i, nalt = 0, nref = 0;
- for (i=0; i<args->nsmpl; i++)
+ if ( args->af_smpl ) // subset samples for AF estimate
{
- int32_t *gt = &args->itmp[i*args->nitmp];
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ int ismpl = args->af_smpl->idx[i];
+ if ( bcf_gt_is_missing(gt[2*ismpl]) || bcf_gt_is_missing(gt[2*ismpl+1]) ) continue;
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+ if ( bcf_gt_allele(gt[2*ismpl]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[0]) ) nalt++;
- else nref++;
+ if ( bcf_gt_allele(gt[2*ismpl+1]) ) nalt++;
+ else nref++;
+ }
+ }
+ else // all samples used in AF estimate
+ {
+ int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr);
+ while ( gt < end )
+ {
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ if ( bcf_gt_allele(gt[0]) ) nalt++;
+ else nref++;
+
+ if ( bcf_gt_allele(gt[1]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[1]) ) nalt++;
- else nref++;
+ gt += 2;
+ }
}
if ( !nalt && !nref ) return -1;
return 0;
}
+int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_freq)
+{
+ double af = 0;
+ int i, j, naf = 0;
+
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+
+ if ( args->af_smpl ) // subset samples for AF estimate
+ {
+ #define BRANCH(type_t) \
+ { \
+ for (i=0; i<args->af_smpl->n; i++) \
+ { \
+ int ismpl = args->af_smpl->idx[i]; \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ else // all samples used in AF estimate
+ {
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p; \
+ p -= fmt_pl->n; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ p += fmt_pl->n; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ if ( !naf ) return -1;
+
+ *alt_freq = af / naf;
+ return 0;
+}
+
+bcf_fmt_t *get_PL(args_t *args, bcf1_t *line)
+{
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->pl_hdr_id ) return &line->d.fmt[i];
+ return NULL;
+}
-int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
+int process_line(args_t *args, bcf1_t *line, int ial)
{
- args->nitmp = 0;
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ double alt_freq;
+ int8_t *GTs = NULL;
+ bcf_fmt_t *fmt_pl = NULL;
// Set allele frequency
- int ret;
+ int ret = 0, i,j;
if ( args->af_tag )
{
// Use an INFO tag provided by the user
ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
- if ( ret==1 )
- *alt_freq = args->AFs[0];
+ if ( ret>0 )
+ alt_freq = args->AFs[ial-1];
if ( ret==-2 )
error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
}
else if ( args->af_fname )
{
// Read AF from a file
- ret = read_AF(args->files->targets, line, alt_freq);
+ ret = read_AF(args->files->targets, line, &alt_freq);
+ }
+ else if ( args->dflt_AF > 0 )
+ {
+ alt_freq = args->dflt_AF;
+ }
+ else if ( args->estimate_AF )
+ {
+ // Estimate AF from GTs or PLs of all samples or samples listed in a file
+ if ( args->af_from_PL )
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ ret = estimate_AF_from_PL(args, fmt_pl, ial, &alt_freq);
+ }
+ else
+ {
+ GTs = get_GT(args, line);
+ if ( !GTs ) return -1;
+ ret = estimate_AF_from_GT(args, GTs, &alt_freq);
+ }
}
else
{
- // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
- ret = -1;
- if ( !args->estimate_AF )
+ // Use AC/AN
+ int AC = -1, AN = 0;
+ ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
+ if ( ret==1 )
{
- int AC = -1, AN = 0;
- ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
- if ( ret==1 )
- {
- AN = args->itmp[0];
- ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
- if ( ret>0 )
- AC = args->itmp[0];
- }
- if ( AN<=0 || AC<0 )
- ret = -1;
- else
- *alt_freq = (double) AC/AN;
+ AN = args->itmp[0];
+ ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
+ if ( ret>0 )
+ AC = args->itmp[0];
}
- if ( ret==-1 )
- ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp
+ if ( AN<=0 || AC<0 )
+ ret = -1;
+ else
+ alt_freq = (double) AC/AN;
}
if ( ret<0 ) return ret;
- if ( *alt_freq==0.0 )
- {
- if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0
- *alt_freq = args->dflt_AF;
- }
+ if ( alt_freq==0.0 ) return -1;
- // Set P(D|G)
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
if ( args->fake_PLs )
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ if ( !GTs ) GTs = get_GT(args, line);
+ }
+ else
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+ }
- int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ int ismpl = args->roh_smpl->idx[i];
- int a = bcf_gt_allele(gt[0]);
- int b = bcf_gt_allele(gt[1]);
- if ( a!=b )
- {
- pdg[0] = pdg[2] = args->unseen_PL;
- pdg[1] = 1 - 2*args->unseen_PL;
- }
- else if ( a==0 )
+ // set P(D|G)
+ double pdg[3];
+ if ( args->fake_PLs )
{
- pdg[0] = 1 - 2*args->unseen_PL;
- pdg[1] = pdg[2] = args->unseen_PL;
+ int8_t *gt = GTs + 2*ismpl;
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ int a = bcf_gt_allele(gt[0]);
+ int b = bcf_gt_allele(gt[1]);
+ if ( a!=b )
+ {
+ pdg[0] = pdg[2] = args->unseen_PL;
+ pdg[1] = 1 - 2*args->unseen_PL;
+ }
+ else if ( a==0 )
+ {
+ pdg[0] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = args->unseen_PL*args->unseen_PL;
+ }
+ else
+ {
+ pdg[0] = args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ }
}
else
{
- pdg[0] = pdg[1] = args->unseen_PL;
- pdg[2] = 1 - 2*args->unseen_PL;
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ pdg[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ pdg[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ pdg[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
}
- }
- else
- {
- args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
- if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
-
- int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
- pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
- pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
- pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;
double sum = pdg[0] + pdg[1] + pdg[2];
- if ( !sum ) return -1;
- pdg[0] /= sum;
- pdg[1] /= sum;
- pdg[2] /= sum;
+ if ( !sum ) continue;
+ for (j=0; j<3; j++) pdg[j] /= sum;
+ if ( args->skip_homref && pdg[0]>0.99 ) continue;
+
+ smpl_t *smpl = &args->smpl[i];
+ smpl->nused++;
+
+ if ( smpl->nsites >= smpl->msites )
+ {
+ hts_expand(uint32_t,smpl->nsites+1,smpl->msites,smpl->sites);
+ smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2);
+ if ( !smpl->eprob ) error("Error: failed to alloc %d bytes\n", sizeof(*smpl->eprob)*smpl->msites*2);
+ }
+
+ // Calculate emission probabilities P(D|AZ) and P(D|HW)
+ double *eprob = &smpl->eprob[2*smpl->nsites];
+ eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
+ eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
+
+ smpl->sites[smpl->nsites] = line->pos;
+ smpl->nsites++;
+
+ if ( args->vi_training )
+ {
+ if ( !smpl->nrid || line->rid!=smpl->rid[smpl->nrid-1] )
+ {
+ smpl->nrid++;
+ smpl->rid = (int*) realloc(smpl->rid,sizeof(*smpl->rid)*smpl->nrid);
+ smpl->rid[smpl->nrid-1] = line->rid;
+ smpl->rid_off = (int*) realloc(smpl->rid_off,sizeof(*smpl->rid_off)*smpl->nrid);
+ smpl->rid_off[smpl->nrid-1] = smpl->nsites - 1;
+ }
+ }
+ else if ( args->nbuf_max && smpl->nsites >= args->nbuf_max ) flush_viterbi(args, i);
}
return 0;
static void vcfroh(args_t *args, bcf1_t *line)
{
+ int i;
+
// Are we done?
if ( !line )
{
- flush_viterbi(args);
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
return;
}
args->ntot++;
- // Skip unwanted lines
+ // Skip unwanted lines, for simplicity we consider only biallelic sites
if ( line->rid == args->skip_rid ) return;
if ( line->n_allele==1 ) return; // no ALT allele
- if ( line->n_allele!=2 ) return; // only biallelic sites
+ if ( line->n_allele > 3 ) return; // cannot be bi-allelic, even with <*>
+
+ // This can be raw callable VCF with the symbolic unseen allele <*>
+ int ial = 0;
+ for (i=1; i<line->n_allele; i++)
+ if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
+ if ( ial==0 ) // normal VCF, the symbolic allele is not present
+ {
+ if ( line->n_allele!=2 ) return; // not biallelic
+ ial = 1;
+ }
+ else
+ {
+ if ( line->n_allele!=3 ) return; // not biallelic
+ ial = ial==1 ? 2 : 1; // <*> can come in any order
+ }
if ( args->snps_only && !bcf_is_snp(line) ) return;
// Initialize genetic map
args->prev_rid = line->rid;
args->prev_pos = line->pos;
skip_rid = load_genmap(args, line);
- if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
}
// New chromosome?
if ( args->prev_rid!=line->rid )
{
skip_rid = load_genmap(args, line);
- if ( args->vi_training )
- {
- if ( !skip_rid ) push_rid(args, line->rid);
- }
- else
+ if ( !args->vi_training )
{
- flush_viterbi(args);
- args->nsites = 0;
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
}
args->prev_rid = line->rid;
args->prev_pos = line->pos;
args->prev_pos = line->pos;
- // Ready for the new site
- int m = args->msites;
- hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
- if ( args->msites!=m )
- args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);
-
- // Set likelihoods and alternate allele frequencies
- double alt_freq, pdg[3];
- if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong
-
- args->nused++;
-
- // Calculate emission probabilities P(D|AZ) and P(D|HW)
- double *eprob = &args->eprob[2*args->nsites];
- eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
- eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
-
- args->sites[args->nsites] = line->pos;
- args->nsites++;
+ // parse the new line
+ process_line(args, line, ial);
}
static void usage(args_t *args)
fprintf(stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
fprintf(stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
fprintf(stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(stderr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
- fprintf(stderr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
+ fprintf(stderr, " -b --buffer-size <int[,int]> buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
+ fprintf(stderr, " If the first number is negative, it is interpreted as the maximum memory to\n");
+ fprintf(stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
+ fprintf(stderr, " -e, --estimate-AF [TAG],<file> estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n");
+ fprintf(stderr, " in <file>. If TAG is not given, the frequency is estimated from GT by default\n");
+ fprintf(stderr, " -G, --GTs-only <float> use GTs and ignore PLs, instead using <float> for PL of the two least likely genotypes.\n");
+ fprintf(stderr, " Safe value to use is 30 to account for GT errors.\n");
+ fprintf(stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n");
fprintf(stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
- fprintf(stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
+ fprintf(stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
+ fprintf(stderr, " is replaced with chromosome name\n");
fprintf(stderr, " -M, --rec-rate <float> constant recombination rate per bp\n");
+ fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n");
fprintf(stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(stderr, " -s, --sample <sample> sample to analyze\n");
+ fprintf(stderr, " -s, --samples <list> list of samples to analyze [all samples]\n");
+ fprintf(stderr, " -S, --samples-file <file> file of samples to analyze [all samples]\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "HMM Options:\n");
fprintf(stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
fprintf(stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
- fprintf(stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
+ fprintf(stderr, " -V, --viterbi-training <float> estimate HMM parameters, <float> is the convergence threshold, e.g. 1e-10 (experimental)\n");
fprintf(stderr, "\n");
exit(1);
}
{"AF-tag",1,0,0},
{"AF-file",1,0,1},
{"AF-dflt",1,0,2},
+ {"buffer-size",1,0,'b'},
+ {"ignore-homref",0,0,'i'},
{"estimate-AF",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,0,'O'},
{"GTs-only",1,0,'G'},
- {"sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
{"hw-to-az",1,0,'a'},
{"az-to-hw",1,0,'H'},
- {"viterbi-training",0,0,'V'},
+ {"viterbi-training",1,0,'V'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
{"regions",1,0,'r'},
{"genetic-map",1,0,'m'},
{"rec-rate",1,0,'M'},
{"skip-indels",0,0,'I'},
+ {"threads",1,0,9},
{0,0,0,0}
};
int naf_opts = 0;
char *tmp;
- while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) {
switch (c) {
case 0: args->af_tag = optarg; naf_opts++; break;
case 1: args->af_fname = optarg; naf_opts++; break;
args->dflt_AF = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
+ if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG;
+ if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ;
+ break;
case 'e': args->estimate_AF = optarg; naf_opts++; break;
+ case 'b': args->buffer_size = optarg; break;
+ case 'i': args->skip_homref = 1; break;
case 'I': args->snps_only = 1; break;
case 'G':
args->fake_PLs = 1;
args->rec_rate = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -M %s\n", optarg);
break;
- case 's': args->sample = strdup(optarg); break;
+ case 's': args->samples = strdup(optarg); break;
+ case 'S': args->samples = strdup(optarg); args->samples_is_file = 1; break;
case 'a':
args->t2AZ = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -a %s\n", optarg);
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
- case 'V': args->vi_training = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'V':
+ args->vi_training = 1;
+ args->baum_welch_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
}
+ if ( !args->output_fname ) args->output_fname = "stdout";
+ if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG;
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
- if ( argc<optind+1 ) usage(args);
+ if ( args->vi_training && args->buffer_size ) error("Error: cannot use -b with -V\n");
if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ);
if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW);
if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n");
if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
error("Failed to read the targets: %s\n", args->af_fname);
}
- if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
while ( bcf_sr_next_line(args->files) )
vcfroh(args, args->files->readers[0].buffer[0]);
}
vcfroh(args, NULL);
- fprintf(stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ int i, nmin = 0;
+ for (i=0; i<args->roh_smpl->n; i++)
+ if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused;
+ fprintf(stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
+ if ( nmin==0 )
+ {
+ fprintf(stderr,"No usable sites were found.");
+ if ( !naf_opts && !args->dflt_AF ) fprintf(stderr, " Consider using one of the AF options.\n");
+ }
destroy_data(args);
free(args);
return 0;
#include <htslib/synced_bcf_reader.h>
#include <htslib/kstring.h>
#include <htslib/kseq.h>
+#include <htslib/bgzf.h>
+#include <errno.h>
#include "bcftools.h"
#include "HMM.h"
+#include "smpl_ilist.h"
#define STATE_HW 0 // normal state, follows Hardy-Weinberg allele frequencies
#define STATE_AZ 1 // autozygous state
+#define OUTPUT_ST (1<<1)
+#define OUTPUT_RG (1<<2)
+#define OUTPUT_GZ (1<<3)
+
/** Genetic map */
typedef struct
{
}
genmap_t;
+/** HMM data for each sample */
+typedef struct
+{
+ double *eprob; // emission probs [2*nsites,msites]
+ uint32_t *sites; // positions [nsites,msites]
+ int nsites, msites;
+ int igenmap; // current position in genmap
+ int nused; // some stats to detect if things didn't go wrong
+ int nrid, *rid, *rid_off; // for viterbi training, keep all chromosomes
+ void *snapshot; // hmm snapshot
+ struct {
+ uint32_t beg,end,nqual;
+ double qual;
+ int rid, state;
+ } rg;
+}
+smpl_t;
+
typedef struct _args_t
{
bcf_srs_t *files;
double rec_rate; // constant recombination rate if > 0
hmm_t *hmm;
- double *eprob; // emission probs [2*nsites,msites]
- uint32_t *sites; // positions [nsites,msites]
- int nsites, msites;
+ double baum_welch_th;
int nrids, *rids, *rid_offs; // multiple chroms with vi_training
+ int nbuf_max, nbuf_olap;
- int32_t *itmp;
- int nitmp, mitmp;
float *AFs;
- int mAFs;
+ int32_t *itmp;
+ int mAFs, nitmp, mitmp, pl_hdr_id, gt_hdr_id;
double pl2p[256], *pdg;
int32_t skip_rid, prev_rid, prev_pos;
- int ntot, nused; // some stats to detect if things didn't go awfully wrong
- int ismpl, nsmpl; // index of query sample
- char *estimate_AF, *sample; // list of samples for AF estimate and query sample
- char **argv, *targets_list, *regions_list, *af_fname, *af_tag;
- int argc, fake_PLs, snps_only, vi_training;
+ int ntot; // some stats to detect if things didn't go wrong
+ smpl_t *smpl; // HMM data for each sample
+ smpl_ilist_t *af_smpl; // list of samples to estimate AF from (--estimate-AF)
+ smpl_ilist_t *roh_smpl; // list of samples to analyze (--samples, --samples-file)
+ char *estimate_AF; // list of samples for AF estimate and query sample
+ int af_from_PL; // estimate AF from FMT/PL rather than FMT/GT
+ char **argv, *targets_list, *regions_list, *af_fname, *af_tag, *samples, *buffer_size, *output_fname;
+ int argc, fake_PLs, snps_only, vi_training, samples_is_file, output_type, skip_homref, n_threads;
+ BGZF *out;
+ kstring_t str;
}
args_t;
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob);
void *smalloc(size_t size)
{
static void init_data(args_t *args)
{
+ int i;
+
args->prev_rid = args->skip_rid = -1;
args->hdr = args->files->readers[0].header;
- if ( !args->sample )
- {
- if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
- args->sample = strdup(args->hdr->samples[0]);
- }
if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");
- // Set samples
- kstring_t str = {0,0,0};
- if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
+ if ( !args->fake_PLs )
{
- int i, n;
- char **smpls = hts_readlist(args->estimate_AF, 1, &n);
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header, consider running with -G\n");
+ if ( bcf_hdr_id2type(args->hdr,BCF_HL_FMT,args->pl_hdr_id)!=BCF_HT_INT )
+ error("Error: The FORMAT/PL tag not defined as Integer in the header\n");
+ }
- // Make sure the query sample is included
- for (i=0; i<n; i++)
- if ( !strcmp(args->sample,smpls[i]) ) break;
+ if ( args->estimate_AF )
+ {
+ if ( !strncmp("GT,",args->estimate_AF,3) ) args->estimate_AF += 3;
+ else if ( !strncmp("PL,",args->estimate_AF,3) ) { args->estimate_AF += 3; args->af_from_PL = 1; }
+ if ( strcmp("-",args->estimate_AF) )
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
- // Add the query sample if not present
- if ( i!=n ) kputs(args->sample, &str);
+ if ( args->estimate_AF || args->fake_PLs )
+ {
+ if ( args->af_from_PL )
+ {
+ args->pl_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "PL");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->pl_hdr_id) )
+ error("Error: The FORMAT/PL tag not found in the header\n");
+ }
+ else
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
+ }
+ if ( args->fake_PLs )
+ {
+ args->gt_hdr_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "GT");
+ if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FMT,args->gt_hdr_id) )
+ error("Error: The FORMAT/GT tag not found in the header\n");
+ }
- for (i=0; i<n; i++)
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+ if ( args->samples )
+ {
+ // we may be able to subset to a few samples, for a text VCF this can be a major speedup
+ if ( (bcf_sr_get_reader(args->files,0))->file->format.format==vcf )
{
- if ( str.l ) kputc(',', &str);
- kputs(smpls[i], &str);
- free(smpls[i]);
+ kstring_t str = {0,0,0};
+ smpl_ilist_t *tmp = args->roh_smpl, *rmme = NULL;
+ if ( args->af_smpl )
+ {
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[args->roh_smpl->idx[i]], &str);
+ }
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ kputc(',', &str);
+ kputs(args->hdr->samples[args->af_smpl->idx[i]], &str);
+ }
+ rmme = tmp = smpl_ilist_init(args->hdr, str.s, 0, SMPL_NONE);
+ }
+ if ( tmp->n < bcf_hdr_nsamples(args->hdr) )
+ {
+ str.l = 0;
+ for (i=0; i<tmp->n; i++)
+ {
+ if ( str.l ) kputc(',', &str);
+ kputs(args->hdr->samples[tmp->idx[i]], &str);
+ }
+ int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
+ if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
+ else if ( ret>0 ) error("The %d-th sample not found in the VCF: %s\n", ret,str.s);
+
+ // update sample ids
+ smpl_ilist_destroy(args->roh_smpl);
+ args->roh_smpl = smpl_ilist_init(args->hdr, args->samples, args->samples_is_file, SMPL_NONE);
+
+ if ( args->af_smpl )
+ {
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = smpl_ilist_init(args->hdr, args->estimate_AF, 1, SMPL_NONE);
+ }
+ }
+ free(str.s);
+ if ( rmme )
+ smpl_ilist_destroy(rmme);
}
- free(smpls);
}
- else if ( !args->estimate_AF )
- kputs(args->sample, &str);
- if ( str.l )
+ // check whether all samples are in this list. If so, the lookup will not be needed
+ if ( args->af_smpl && args->af_smpl->n == bcf_hdr_nsamples(args->hdr) )
{
- int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
- if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
- else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
+ // all samples are in this list
+ smpl_ilist_destroy(args->af_smpl);
+ args->af_smpl = NULL;
}
- if ( args->af_tag )
- if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
- error("No such INFO tag in the VCF: %s\n", args->af_tag);
+ if ( args->buffer_size )
+ {
+ args->nbuf_olap = -1;
+ char *end;
+ double tmp = strtod(args->buffer_size,&end);
+ if ( *end )
+ {
+ if ( *end!=',') error("Could not parse: --buffer-size %s\n", args->buffer_size);
+ args->nbuf_olap = strtol(end+1,&end,10);
+ if ( *end || args->nbuf_olap<0 ) error("Could not parse: --bufer-size %s\n", args->buffer_size);
+ }
+ if ( tmp<0 )
+ args->nbuf_max = fabs(tmp)*1e6/(4+8*2)/args->roh_smpl->n;
+ else
+ args->nbuf_max = tmp;
- args->nsmpl = bcf_hdr_nsamples(args->hdr);
- args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
- free(str.s);
+ if ( args->nbuf_olap<0 )
+ args->nbuf_olap = args->nbuf_max*0.01;
+ }
+ fprintf(pysam_stderr,"Number of target samples: %d\n", args->roh_smpl->n);
+ fprintf(pysam_stderr,"Number of --estimate-AF samples: %d\n", args->af_smpl ? args->af_smpl->n : (args->estimate_AF ? bcf_hdr_nsamples(args->hdr) : 0));
+ fprintf(pysam_stderr,"Number of sites in the buffer/overlap: ");
+ if ( args->nbuf_max ) fprintf(pysam_stderr,"%d/%d\n", args->nbuf_max,args->nbuf_olap);
+ else fprintf(pysam_stderr,"unlimited\n");
+
+ args->smpl = (smpl_t*) calloc(args->roh_smpl->n,sizeof(smpl_t));
- int i;
for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);
// Init transition matrix and HMM
MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ args->hmm = hmm_init(2, tprob, 10000);
if ( args->genmap_fname )
- {
- args->hmm = hmm_init(2, tprob, 0);
hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
- }
else if ( args->rec_rate > 0 )
- {
- args->hmm = hmm_init(2, tprob, 0);
- hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
+ hmm_set_tprob_func(args->hmm, set_tprob_rrate, args);
- }
- else
- args->hmm = hmm_init(2, tprob, 10000);
+ args->out = bgzf_open(strcmp("pysam_stdout",args->output_fname)?args->output_fname:"-", args->output_type&OUTPUT_GZ ? "wg" : "wu");
+ if ( !args->out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));
// print header
- fprintf(pysam_stdout, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
- fprintf(pysam_stdout, "# The command line was:\tbcftools %s", args->argv[0]);
+ args->str.l = 0;
+ ksprintf(&args->str, "# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
+ ksprintf(&args->str, "# The command line was:\tbcftools %s", args->argv[0]);
for (i=1; i<args->argc; i++)
- fprintf(pysam_stdout, " %s",args->argv[i]);
- fprintf(pysam_stdout, "\n#\n");
- fprintf(pysam_stdout, "# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
+ ksprintf(&args->str, " %s",args->argv[i]);
+ ksprintf(&args->str, "\n#\n");
+ if ( args->output_type & OUTPUT_RG )
+ {
+ i = 2;
+ ksprintf(&args->str, "# RG");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Start", i++);
+ ksprintf(&args->str, "\t[%d]End", i++);
+ ksprintf(&args->str, "\t[%d]Length (bp)", i++);
+ ksprintf(&args->str, "\t[%d]Number of markers", i++);
+ ksprintf(&args->str, "\t[%d]Quality (average fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->output_type & OUTPUT_ST )
+ {
+ i = 2;
+ ksprintf(&args->str, "# ST");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Chromosome", i++);
+ ksprintf(&args->str, "\t[%d]Position", i++);
+ ksprintf(&args->str, "\t[%d]State (0:HW, 1:AZ)", i++);
+ ksprintf(&args->str, "\t[%d]Quality (fwd-bwd phred score)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( args->vi_training)
+ {
+ i = 2;
+ ksprintf(&args->str, "# VT, Viterbi Training");
+ ksprintf(&args->str, "\t[%d]Sample", i++);
+ ksprintf(&args->str, "\t[%d]Iteration", i++);
+ ksprintf(&args->str, "\t[%d]dAZ", i++);
+ ksprintf(&args->str, "\t[%d]dHW", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(HW|HW)", i++);
+ ksprintf(&args->str, "\t[%d]P(AZ|HW)", i++);
+ ksprintf(&args->str, "\t[%d]1 - P(AZ|AZ)", i++);
+ ksprintf(&args->str, "\t[%d]P(HW|AZ)", i++);
+ ksprintf(&args->str, "\n");
+ }
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l )
+ error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
static void destroy_data(args_t *args)
{
- free(args->sites);
- free(args->eprob);
- free(args->sample);
+ if ( bgzf_close(args->out)!=0 ) error("Error: close failed .. %s\n", args->output_fname);
+ int i;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ free(args->smpl[i].eprob);
+ free(args->smpl[i].sites);
+ free(args->smpl[i].rid);
+ free(args->smpl[i].rid_off);
+ free(args->smpl[i].snapshot);
+ }
+ free(args->str.s);
+ free(args->smpl);
+ if ( args->af_smpl ) smpl_ilist_destroy(args->af_smpl);
+ smpl_ilist_destroy(args->roh_smpl);
free(args->rids);
free(args->rid_offs);
hmm_destroy(args->hmm);
bcf_sr_destroy(args->files);
- free(args->itmp); free(args->AFs); free(args->pdg);
+ free(args->AFs); free(args->pdg);
free(args->genmap);
+ free(args->itmp);
+ free(args->samples);
}
static int load_genmap(args_t *args, bcf1_t *line)
hts_expand(genmap_t,args->ngenmap,args->mgenmap,args->genmap);
genmap_t *gm = &args->genmap[args->ngenmap-1];
+ // position, convert to 0-based
char *tmp, *end;
gm->pos = strtol(str.s, &tmp, 10);
if ( str.s==tmp ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->pos -= 1;
// skip second column
tmp++;
while ( *tmp && !isspace(*tmp) ) tmp++;
- // read the genetic map in cM
+ // read the genetic map in cM, scale from % to likelihood
gm->rate = strtod(tmp+1, &end);
if ( tmp+1==end ) error("Could not parse %s: %s\n", fname, str.s);
+ gm->rate *= 0.01;
}
if ( !args->ngenmap ) error("Genetic map empty?\n");
- int i;
- for (i=0; i<args->ngenmap; i++) args->genmap[i].rate /= args->genmap[args->ngenmap-1].rate; // scale to 1
if ( hts_close(fp) ) error("Close failed\n");
free(str.s);
return 0;
// position j to be equal or larger than end
int j = i;
while ( j+1<args->ngenmap && args->genmap[j].pos < end ) j++;
-
if ( i==j )
{
args->igenmap = i;
void set_tprob_genmap(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
- double ci = get_genmap_rate(args, pos - prev_pos, pos);
+ double ci = get_genmap_rate(args, prev_pos, pos);
+ if ( args->rec_rate ) ci *= args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
MAT(tprob,2,STATE_HW,STATE_HW) = 1 - MAT(tprob,2,STATE_AZ,STATE_HW);
}
-void set_tprob_recrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
+void set_tprob_rrate(hmm_t *hmm, uint32_t prev_pos, uint32_t pos, void *data, double *tprob)
{
args_t *args = (args_t*) data;
double ci = (pos - prev_pos) * args->rec_rate;
+ if ( ci > 1 ) ci = 1;
MAT(tprob,2,STATE_HW,STATE_AZ) *= ci;
MAT(tprob,2,STATE_AZ,STATE_HW) *= ci;
MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - MAT(tprob,2,STATE_HW,STATE_AZ);
*
*/
-static void flush_viterbi(args_t *args)
+static void flush_viterbi(args_t *args, int ismpl)
{
- int i,j;
+ smpl_t *smpl = &args->smpl[ismpl];
+ if ( !smpl->nsites ) return;
- if ( !args->nsites ) return;
+ const char *name = args->hdr->samples[ args->roh_smpl->idx[ismpl] ];
- if ( !args->vi_training )
+ int i,j,k;
+
+ if ( !args->vi_training ) // single viterbi pass
{
- // single viterbi pass, one chromsome
- hmm_run_viterbi(args->hmm, args->nsites, args->eprob, args->sites);
- hmm_run_fwd_bwd(args->hmm, args->nsites, args->eprob, args->sites);
+ hmm_restore(args->hmm, smpl->snapshot);
+ int end = (args->nbuf_max && smpl->nsites >= args->nbuf_max && smpl->nsites > args->nbuf_olap) ? smpl->nsites - args->nbuf_olap : smpl->nsites;
+ if ( end < smpl->nsites )
+ smpl->snapshot = hmm_snapshot(args->hmm, smpl->snapshot, smpl->nsites - args->nbuf_olap - 1);
+
+ args->igenmap = smpl->igenmap;
+ hmm_run_viterbi(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
+ hmm_run_fwd_bwd(args->hmm, smpl->nsites, smpl->eprob, smpl->sites);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (i=0; i<args->nsites; i++)
+ const char *chr = bcf_hdr_id2name(args->hdr,args->prev_rid);
+ uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
+
+ for (i=0; i<end; i++)
{
int state = vpath[i*2]==STATE_AZ ? 1 : 0;
- double *pval = fwd + i*2;
- fprintf(pysam_stdout, "%s\t%d\t%d\t%.1f\n", chr,args->sites[i]+1, state, phred_score(1.0-pval[state]));
- }
- return;
- }
+ double qual = phred_score(1.0 - fwd[i*2 + state]);
+ if ( args->output_type & OUTPUT_ST )
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "ST\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[i]+1, state, qual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ }
- // viterbi training, multiple chromosomes
- double t2az_prev, t2hw_prev;
- double deltaz, delthw;
- int niter = 0;
- do
- {
- double *tprob_arr = hmm_get_tprob(args->hmm);
- t2az_prev = MAT(tprob_arr,2,1,0); //args->t2AZ;
- t2hw_prev = MAT(tprob_arr,2,0,1); //args->t2HW;
- double tcounts[] = { 0,0,0,0 };
- for (i=0; i<args->nrids; i++)
- {
- // run viterbi for each chromosomes. eprob and sites contain
- // multiple chromosomes, rid_offs mark the boundaries
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
-
- // what transitions were observed: add to the total counts
- uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
- for (j=1; j<nsites; j++)
+ if ( args->output_type & OUTPUT_RG )
{
- // count the number of transitions
- int prev_state = vpath[2*(j-1)];
- int curr_state = vpath[2*j];
- MAT(tcounts,2,curr_state,prev_state) += 1;
+ if ( state!=smpl->rg.state )
+ {
+ if ( !state ) // the region ends, flush
+ {
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
+ }
+ else
+ {
+ smpl->rg.state = 1;
+ smpl->rg.beg = smpl->sites[i];
+ smpl->rg.rid = args->prev_rid;
+ }
+ }
+ else if ( state )
+ {
+ smpl->rg.nqual++;
+ smpl->rg.qual += qual;
+ smpl->rg.end = smpl->sites[i];
+ }
}
}
- // update the transition matrix
- int n = 1;
- for (i=0; i<2; i++)
+ if ( end < smpl->nsites )
{
- for (j=0; j<2; j++) n += MAT(tcounts,2,i,j);
+ end = smpl->nsites - args->nbuf_olap;
+ memmove(smpl->sites, smpl->sites + end, sizeof(*smpl->sites)*args->nbuf_olap);
+ memmove(smpl->eprob, smpl->eprob + end*2, sizeof(*smpl->eprob)*args->nbuf_olap*2);
+ smpl->nsites = args->nbuf_olap;
+ smpl->igenmap = args->igenmap;
}
- for (i=0; i<2; i++)
+ else
{
- for (j=0; j<2; j++)
+ smpl->nsites = 0;
+ smpl->igenmap = 0;
+
+ if ( smpl->rg.state )
{
- // no transition to i-th state was observed, set to a small number
- if ( !MAT(tcounts,2,i,j) ) MAT(tcounts,2,i,j) = 0.1/n;
- else MAT(tcounts,2,i,j) /= n;
+ args->str.l = 0;
+ ksprintf(&args->str, "RG\t%s\t%s\t%d\t%d\t%d\t%d\t%.1f\n",name,bcf_hdr_id2name(args->hdr,smpl->rg.rid),
+ smpl->rg.beg+1,smpl->rg.end+1,smpl->rg.end-smpl->rg.beg+1,smpl->rg.nqual,smpl->rg.qual/smpl->rg.nqual);
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
+ smpl->rg.state = 0;
}
}
- // normalize
- for (i=0; i<2; i++)
+ return;
+ }
+
+
+ // viterbi training, multiple chromosomes
+ double t2az_prev, t2hw_prev;
+ double deltaz, delthw;
+
+ double *tprob_arr = hmm_get_tprob(args->hmm);
+ MAT(tprob_arr,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
+ MAT(tprob_arr,2,STATE_HW,STATE_AZ) = args->t2HW;
+ MAT(tprob_arr,2,STATE_AZ,STATE_HW) = args->t2AZ;
+ MAT(tprob_arr,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;
+ hmm_set_tprob(args->hmm, tprob_arr, 10000);
+
+ int niter = 0;
+ do
+ {
+ tprob_arr = hmm_get_tprob(args->hmm);
+ t2az_prev = MAT(tprob_arr,2,STATE_AZ,STATE_HW); //args->t2AZ;
+ t2hw_prev = MAT(tprob_arr,2,STATE_HW,STATE_AZ); //args->t2HW;
+ double tprob_new[] = { 0,0,0,0 };
+ for (i=0; i<smpl->nrid; i++)
{
- double norm = 0;
- for (j=0; j<2; j++) norm += MAT(tcounts,2,j,i);
- assert( norm!=0 );
- for (j=0; j<2; j++) MAT(tcounts,2,j,i) /= norm;
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ tprob_arr = hmm_run_baum_welch(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) += MAT(tprob_arr,2,j,k);
}
+ for (j=0; j<2; j++)
+ for (k=0; k<2; k++) MAT(tprob_new,2,j,k) /= smpl->nrid;
- if ( args->genmap_fname || args->rec_rate > 0 )
- hmm_set_tprob(args->hmm, tcounts, 0);
- else
- hmm_set_tprob(args->hmm, tcounts, 10000);
+ hmm_set_tprob(args->hmm, tprob_new, 10000);
- tprob_arr = hmm_get_tprob(args->hmm);
- deltaz = fabs(MAT(tprob_arr,2,1,0)-t2az_prev);
- delthw = fabs(MAT(tprob_arr,2,0,1)-t2hw_prev);
+ deltaz = fabs(MAT(tprob_new,2,STATE_AZ,STATE_HW)-t2az_prev);
+ delthw = fabs(MAT(tprob_new,2,STATE_HW,STATE_AZ)-t2hw_prev);
niter++;
- fprintf(pysam_stderr,"Viterbi training, iteration %d: dAZ=%e dHW=%e\tP(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n",
- niter,deltaz,delthw,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ args->str.l = 0;
+ ksprintf(&args->str, "VT\t%s\t%d\t%e\t%e\t%e\t%e\t%e\t%e\n",
+ name,niter,deltaz,delthw,
+ 1-MAT(tprob_new,2,STATE_HW,STATE_HW),MAT(tprob_new,2,STATE_AZ,STATE_HW),
+ 1-MAT(tprob_new,2,STATE_AZ,STATE_AZ),MAT(tprob_new,2,STATE_HW,STATE_AZ));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
- while ( deltaz > 0.0 || delthw > 0.0 );
- double *tprob_arr = hmm_get_tprob(args->hmm);
- fprintf(pysam_stderr, "Viterbi training converged in %d iterations to P(HW|HW)=%e P(AZ|HW)=%e P(AZ|AZ)=%e P(HW|AZ)=%e\n", niter,
- MAT(tprob_arr,2,STATE_HW,STATE_HW),MAT(tprob_arr,2,STATE_AZ,STATE_HW),
- MAT(tprob_arr,2,STATE_AZ,STATE_AZ),MAT(tprob_arr,2,STATE_HW,STATE_AZ));
+ while ( deltaz > args->baum_welch_th || delthw > args->baum_welch_th );
// output the results
- for (i=0; i<args->nrids; i++)
+ for (i=0; i<smpl->nrid; i++)
{
- int ioff = args->rid_offs[i];
- int nsites = (i+1==args->nrids ? args->nsites : args->rid_offs[i+1]) - ioff;
- hmm_run_viterbi(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
- hmm_run_fwd_bwd(args->hmm, nsites, args->eprob+ioff*2, args->sites+ioff);
+ int ioff = smpl->rid_off[i];
+ int nsites = (i+1==smpl->nrid ? smpl->nsites : smpl->rid_off[i+1]) - ioff;
+ args->igenmap = 0;
+ hmm_run_viterbi(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
+ hmm_run_fwd_bwd(args->hmm, nsites, smpl->eprob+ioff*2, smpl->sites+ioff);
uint8_t *vpath = hmm_get_viterbi_path(args->hmm);
double *fwd = hmm_get_fwd_bwd_prob(args->hmm);
- const char *chr = bcf_hdr_id2name(args->hdr,args->rids[i]);
+ const char *chr = bcf_hdr_id2name(args->hdr,smpl->rid[i]);
for (j=0; j<nsites; j++)
{
- int state = vpath[j*2];
- double pval = fwd[j*2 + state];
- fprintf(pysam_stdout, "%s\t%d\t%d\t%e\n", chr,args->sites[ioff+j]+1,state==STATE_AZ ? 1 : 0, pval);
+ int state = vpath[j*2]==STATE_AZ ? 1 : 0;
+ double *pval = fwd + j*2;
+ args->str.l = 0;
+ ksprintf(&args->str, "ROH\t%s\t%s\t%d\t%d\t%.1f\n", name,chr,smpl->sites[ioff+j]+1, state, phred_score(1.0-pval[state]));
+ if ( bgzf_write(args->out, args->str.s, args->str.l) != args->str.l ) error("Error writing %s: %s\n", args->output_fname, strerror(errno));
}
}
}
-static void push_rid(args_t *args, int rid)
-{
- args->nrids++;
- args->rids = (int*) realloc(args->rids, args->nrids*sizeof(int));
- args->rid_offs = (int*) realloc(args->rid_offs, args->nrids*sizeof(int));
- args->rids[ args->nrids-1 ] = rid;
- args->rid_offs[ args->nrids-1 ] = args->nsites;
-}
int read_AF(bcf_sr_regions_t *tgt, bcf1_t *line, double *alt_freq)
{
return 0;
}
-int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
+int8_t *get_GT(args_t *args, bcf1_t *line)
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->gt_hdr_id ) break;
+ if ( i==line->n_fmt ) return NULL; // the tag is not present in this record
+
+ bcf_fmt_t *fmt = &line->d.fmt[i];
+ if ( fmt->n!=2 ) return NULL; // not diploid
+ if ( fmt->type!=BCF_BT_INT8 ) error("This is unexpected, GT type is %d\n", fmt->type);
+ return (int8_t*) fmt->p;
+}
+
+int estimate_AF_from_GT(args_t *args, int8_t *gt, double *alt_freq)
+{
int i, nalt = 0, nref = 0;
- for (i=0; i<args->nsmpl; i++)
+ if ( args->af_smpl ) // subset samples for AF estimate
{
- int32_t *gt = &args->itmp[i*args->nitmp];
+ for (i=0; i<args->af_smpl->n; i++)
+ {
+ int ismpl = args->af_smpl->idx[i];
+ if ( bcf_gt_is_missing(gt[2*ismpl]) || bcf_gt_is_missing(gt[2*ismpl+1]) ) continue;
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+ if ( bcf_gt_allele(gt[2*ismpl]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[0]) ) nalt++;
- else nref++;
+ if ( bcf_gt_allele(gt[2*ismpl+1]) ) nalt++;
+ else nref++;
+ }
+ }
+ else // all samples used in AF estimate
+ {
+ int8_t *end = gt + 2*bcf_hdr_nsamples(args->hdr);
+ while ( gt < end )
+ {
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ if ( bcf_gt_allele(gt[0]) ) nalt++;
+ else nref++;
+
+ if ( bcf_gt_allele(gt[1]) ) nalt++;
+ else nref++;
- if ( bcf_gt_allele(gt[1]) ) nalt++;
- else nref++;
+ gt += 2;
+ }
}
if ( !nalt && !nref ) return -1;
return 0;
}
+int estimate_AF_from_PL(args_t *args, bcf_fmt_t *fmt_pl, int ial, double *alt_freq)
+{
+ double af = 0;
+ int i, j, naf = 0;
+
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+
+ if ( args->af_smpl ) // subset samples for AF estimate
+ {
+ #define BRANCH(type_t) \
+ { \
+ for (i=0; i<args->af_smpl->n; i++) \
+ { \
+ int ismpl = args->af_smpl->idx[i]; \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ else // all samples used in AF estimate
+ {
+ int nsmpl = bcf_hdr_nsamples(args->hdr);
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p; \
+ p -= fmt_pl->n; \
+ for (i=0; i<nsmpl; i++) \
+ { \
+ p += fmt_pl->n; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ double prob[3], norm = 0; \
+ prob[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ prob[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ prob[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ for (j=0; j<3; j++) norm += prob[j]; \
+ for (j=0; j<3; j++) prob[j] /= norm; \
+ af += 0.5*prob[1] + prob[2]; \
+ naf++; \
+ } \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
+ }
+ if ( !naf ) return -1;
+
+ *alt_freq = af / naf;
+ return 0;
+}
+
+bcf_fmt_t *get_PL(args_t *args, bcf1_t *line)
+{
+ int i;
+ for (i=0; i<line->n_fmt; i++)
+ if ( line->d.fmt[i].id==args->pl_hdr_id ) return &line->d.fmt[i];
+ return NULL;
+}
-int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
+int process_line(args_t *args, bcf1_t *line, int ial)
{
- args->nitmp = 0;
+ if ( !(line->unpacked & BCF_UN_FMT) ) bcf_unpack(line, BCF_UN_FMT);
+
+ double alt_freq;
+ int8_t *GTs = NULL;
+ bcf_fmt_t *fmt_pl = NULL;
// Set allele frequency
- int ret;
+ int ret = 0, i,j;
if ( args->af_tag )
{
// Use an INFO tag provided by the user
ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
- if ( ret==1 )
- *alt_freq = args->AFs[0];
+ if ( ret>0 )
+ alt_freq = args->AFs[ial-1];
if ( ret==-2 )
error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
}
else if ( args->af_fname )
{
// Read AF from a file
- ret = read_AF(args->files->targets, line, alt_freq);
+ ret = read_AF(args->files->targets, line, &alt_freq);
+ }
+ else if ( args->dflt_AF > 0 )
+ {
+ alt_freq = args->dflt_AF;
+ }
+ else if ( args->estimate_AF )
+ {
+ // Estimate AF from GTs or PLs of all samples or samples listed in a file
+ if ( args->af_from_PL )
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ ret = estimate_AF_from_PL(args, fmt_pl, ial, &alt_freq);
+ }
+ else
+ {
+ GTs = get_GT(args, line);
+ if ( !GTs ) return -1;
+ ret = estimate_AF_from_GT(args, GTs, &alt_freq);
+ }
}
else
{
- // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
- ret = -1;
- if ( !args->estimate_AF )
+ // Use AC/AN
+ int AC = -1, AN = 0;
+ ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
+ if ( ret==1 )
{
- int AC = -1, AN = 0;
- ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
- if ( ret==1 )
- {
- AN = args->itmp[0];
- ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
- if ( ret>0 )
- AC = args->itmp[0];
- }
- if ( AN<=0 || AC<0 )
- ret = -1;
- else
- *alt_freq = (double) AC/AN;
+ AN = args->itmp[0];
+ ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
+ if ( ret>0 )
+ AC = args->itmp[0];
}
- if ( ret==-1 )
- ret = estimate_AF(args, line, alt_freq); // reads GTs into args->itmp
+ if ( AN<=0 || AC<0 )
+ ret = -1;
+ else
+ alt_freq = (double) AC/AN;
}
if ( ret<0 ) return ret;
- if ( *alt_freq==0.0 )
- {
- if ( args->dflt_AF==0 ) return -1; // we skip sites with AF=0
- *alt_freq = args->dflt_AF;
- }
+ if ( alt_freq==0.0 ) return -1;
- // Set P(D|G)
+ int irr = bcf_alleles2gt(0,0), ira = bcf_alleles2gt(0,ial), iaa = bcf_alleles2gt(ial,ial);
if ( args->fake_PLs )
{
- if ( !args->nitmp )
- {
- args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
- if ( args->nitmp != 2*args->nsmpl ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
- }
+ if ( !GTs ) GTs = get_GT(args, line);
+ }
+ else
+ {
+ fmt_pl = get_PL(args, line);
+ if ( !fmt_pl ) return -1;
+ if ( iaa >= fmt_pl->n ) return -1; // not diploid or wrong number of fields
+ }
- int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
- if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;
+ for (i=0; i<args->roh_smpl->n; i++)
+ {
+ int ismpl = args->roh_smpl->idx[i];
- int a = bcf_gt_allele(gt[0]);
- int b = bcf_gt_allele(gt[1]);
- if ( a!=b )
- {
- pdg[0] = pdg[2] = args->unseen_PL;
- pdg[1] = 1 - 2*args->unseen_PL;
- }
- else if ( a==0 )
+ // set P(D|G)
+ double pdg[3];
+ if ( args->fake_PLs )
{
- pdg[0] = 1 - 2*args->unseen_PL;
- pdg[1] = pdg[2] = args->unseen_PL;
+ int8_t *gt = GTs + 2*ismpl;
+ if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;
+
+ int a = bcf_gt_allele(gt[0]);
+ int b = bcf_gt_allele(gt[1]);
+ if ( a!=b )
+ {
+ pdg[0] = pdg[2] = args->unseen_PL;
+ pdg[1] = 1 - 2*args->unseen_PL;
+ }
+ else if ( a==0 )
+ {
+ pdg[0] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = args->unseen_PL*args->unseen_PL;
+ }
+ else
+ {
+ pdg[0] = args->unseen_PL*args->unseen_PL;
+ pdg[1] = args->unseen_PL;
+ pdg[2] = 1 - args->unseen_PL - args->unseen_PL*args->unseen_PL;
+ }
}
else
{
- pdg[0] = pdg[1] = args->unseen_PL;
- pdg[2] = 1 - 2*args->unseen_PL;
+ #define BRANCH(type_t) \
+ { \
+ type_t *p = (type_t*)fmt_pl->p + fmt_pl->n*ismpl; \
+ if ( p[irr]<0 || p[ira]<0 || p[iaa]<0 ) continue; /* missing value */ \
+ if ( p[irr]==p[ira] && p[irr]==p[iaa] ) continue; /* all values are the same */ \
+ pdg[0] = p[irr] < (type_t)256 ? args->pl2p[ p[irr] ] : args->pl2p[255]; \
+ pdg[1] = p[ira] < (type_t)256 ? args->pl2p[ p[ira] ] : args->pl2p[255]; \
+ pdg[2] = p[iaa] < (type_t)256 ? args->pl2p[ p[iaa] ] : args->pl2p[255]; \
+ }
+ switch (fmt_pl->type) {
+ case BCF_BT_INT8: BRANCH(int8_t); break;
+ case BCF_BT_INT16: BRANCH(int16_t); break;
+ case BCF_BT_INT32: BRANCH(int32_t); break;
+ default: fprintf(pysam_stderr,"Unknown format type for PL: %s:%d .. fmt->type=%d\n", __FILE__,__LINE__, fmt_pl->type); exit(1);
+ }
+ #undef BRANCH
}
- }
- else
- {
- args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
- if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1; // not diploid?
- args->nitmp /= args->nsmpl;
-
- int32_t *pl = &args->itmp[args->ismpl*args->nitmp];
- pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
- pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
- pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;
double sum = pdg[0] + pdg[1] + pdg[2];
- if ( !sum ) return -1;
- pdg[0] /= sum;
- pdg[1] /= sum;
- pdg[2] /= sum;
+ if ( !sum ) continue;
+ for (j=0; j<3; j++) pdg[j] /= sum;
+ if ( args->skip_homref && pdg[0]>0.99 ) continue;
+
+ smpl_t *smpl = &args->smpl[i];
+ smpl->nused++;
+
+ if ( smpl->nsites >= smpl->msites )
+ {
+ hts_expand(uint32_t,smpl->nsites+1,smpl->msites,smpl->sites);
+ smpl->eprob = (double*) realloc(smpl->eprob,sizeof(*smpl->eprob)*smpl->msites*2);
+ if ( !smpl->eprob ) error("Error: failed to alloc %d bytes\n", sizeof(*smpl->eprob)*smpl->msites*2);
+ }
+
+ // Calculate emission probabilities P(D|AZ) and P(D|HW)
+ double *eprob = &smpl->eprob[2*smpl->nsites];
+ eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
+ eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
+
+ smpl->sites[smpl->nsites] = line->pos;
+ smpl->nsites++;
+
+ if ( args->vi_training )
+ {
+ if ( !smpl->nrid || line->rid!=smpl->rid[smpl->nrid-1] )
+ {
+ smpl->nrid++;
+ smpl->rid = (int*) realloc(smpl->rid,sizeof(*smpl->rid)*smpl->nrid);
+ smpl->rid[smpl->nrid-1] = line->rid;
+ smpl->rid_off = (int*) realloc(smpl->rid_off,sizeof(*smpl->rid_off)*smpl->nrid);
+ smpl->rid_off[smpl->nrid-1] = smpl->nsites - 1;
+ }
+ }
+ else if ( args->nbuf_max && smpl->nsites >= args->nbuf_max ) flush_viterbi(args, i);
}
return 0;
static void vcfroh(args_t *args, bcf1_t *line)
{
+ int i;
+
// Are we done?
if ( !line )
{
- flush_viterbi(args);
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
return;
}
args->ntot++;
- // Skip unwanted lines
+ // Skip unwanted lines, for simplicity we consider only biallelic sites
if ( line->rid == args->skip_rid ) return;
if ( line->n_allele==1 ) return; // no ALT allele
- if ( line->n_allele!=2 ) return; // only biallelic sites
+ if ( line->n_allele > 3 ) return; // cannot be bi-allelic, even with <*>
+
+ // This can be raw callable VCF with the symbolic unseen allele <*>
+ int ial = 0;
+ for (i=1; i<line->n_allele; i++)
+ if ( !strcmp("<*>",line->d.allele[i]) ) { ial = i; break; }
+ if ( ial==0 ) // normal VCF, the symbolic allele is not present
+ {
+ if ( line->n_allele!=2 ) return; // not biallelic
+ ial = 1;
+ }
+ else
+ {
+ if ( line->n_allele!=3 ) return; // not biallelic
+ ial = ial==1 ? 2 : 1; // <*> can come in any order
+ }
if ( args->snps_only && !bcf_is_snp(line) ) return;
// Initialize genetic map
args->prev_rid = line->rid;
args->prev_pos = line->pos;
skip_rid = load_genmap(args, line);
- if ( !skip_rid && args->vi_training ) push_rid(args, line->rid);
}
// New chromosome?
if ( args->prev_rid!=line->rid )
{
skip_rid = load_genmap(args, line);
- if ( args->vi_training )
- {
- if ( !skip_rid ) push_rid(args, line->rid);
- }
- else
+ if ( !args->vi_training )
{
- flush_viterbi(args);
- args->nsites = 0;
+ for (i=0; i<args->roh_smpl->n; i++) flush_viterbi(args, i);
}
args->prev_rid = line->rid;
args->prev_pos = line->pos;
args->prev_pos = line->pos;
- // Ready for the new site
- int m = args->msites;
- hts_expand(uint32_t,args->nsites+1,args->msites,args->sites);
- if ( args->msites!=m )
- args->eprob = (double*) realloc(args->eprob,sizeof(double)*args->msites*2);
-
- // Set likelihoods and alternate allele frequencies
- double alt_freq, pdg[3];
- if ( parse_line(args, line, &alt_freq, pdg)<0 ) return; // something went wrong
-
- args->nused++;
-
- // Calculate emission probabilities P(D|AZ) and P(D|HW)
- double *eprob = &args->eprob[2*args->nsites];
- eprob[STATE_AZ] = pdg[0]*(1-alt_freq) + pdg[2]*alt_freq;
- eprob[STATE_HW] = pdg[0]*(1-alt_freq)*(1-alt_freq) + 2*pdg[1]*(1-alt_freq)*alt_freq + pdg[2]*alt_freq*alt_freq;
-
- args->sites[args->nsites] = line->pos;
- args->nsites++;
+ // parse the new line
+ process_line(args, line, ial);
}
static void usage(args_t *args)
fprintf(pysam_stderr, " --AF-dflt <float> if AF is not known, use this allele frequency [skip]\n");
fprintf(pysam_stderr, " --AF-tag <TAG> use TAG for allele frequency\n");
fprintf(pysam_stderr, " --AF-file <file> read allele frequencies from file (CHR\\tPOS\\tREF,ALT\\tAF)\n");
- fprintf(pysam_stderr, " -e, --estimate-AF <file> calculate AC,AN counts on the fly, using either all samples (\"-\") or samples listed in <file>\n");
- fprintf(pysam_stderr, " -G, --GTs-only <float> use GTs, ignore PLs, use <float> for PL of unseen genotypes. Safe value to use is 30 to account for GT errors.\n");
+ fprintf(pysam_stderr, " -b --buffer-size <int[,int]> buffer size and the number of overlapping sites, 0 for unlimited [0]\n");
+ fprintf(pysam_stderr, " If the first number is negative, it is interpreted as the maximum memory to\n");
+ fprintf(pysam_stderr, " use, in MB. The default overlap is set to roughly 1%% of the buffer size.\n");
+ fprintf(pysam_stderr, " -e, --estimate-AF [TAG],<file> estimate AF from FORMAT/TAG (GT or PL) of all samples (\"-\") or samples listed\n");
+ fprintf(pysam_stderr, " in <file>. If TAG is not given, the frequency is estimated from GT by default\n");
+ fprintf(pysam_stderr, " -G, --GTs-only <float> use GTs and ignore PLs, instead using <float> for PL of the two least likely genotypes.\n");
+ fprintf(pysam_stderr, " Safe value to use is 30 to account for GT errors.\n");
+ fprintf(pysam_stderr, " -i, --ignore-homref skip hom-ref genotypes (0/0)\n");
fprintf(pysam_stderr, " -I, --skip-indels skip indels as their genotypes are enriched for errors\n");
- fprintf(pysam_stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\" is replaced with chromosome name\n");
+ fprintf(pysam_stderr, " -m, --genetic-map <file> genetic map in IMPUTE2 format, single file or mask, where string \"{CHROM}\"\n");
+ fprintf(pysam_stderr, " is replaced with chromosome name\n");
fprintf(pysam_stderr, " -M, --rec-rate <float> constant recombination rate per bp\n");
+ fprintf(pysam_stderr, " -o, --output <file> write output to a file [standard output]\n");
+ fprintf(pysam_stderr, " -O, --output-type [srz] output s:per-site, r:regions, z:compressed [sr]\n");
fprintf(pysam_stderr, " -r, --regions <region> restrict to comma-separated list of regions\n");
fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
- fprintf(pysam_stderr, " -s, --sample <sample> sample to analyze\n");
+ fprintf(pysam_stderr, " -s, --samples <list> list of samples to analyze [all samples]\n");
+ fprintf(pysam_stderr, " -S, --samples-file <file> file of samples to analyze [all samples]\n");
fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "HMM Options:\n");
fprintf(pysam_stderr, " -a, --hw-to-az <float> P(AZ|HW) transition probability from HW (Hardy-Weinberg) to AZ (autozygous) state [6.7e-8]\n");
fprintf(pysam_stderr, " -H, --az-to-hw <float> P(HW|AZ) transition probability from AZ to HW state [5e-9]\n");
- fprintf(pysam_stderr, " -V, --viterbi-training perform Viterbi training to estimate transition probabilities\n");
+ fprintf(pysam_stderr, " -V, --viterbi-training <float> estimate HMM parameters, <float> is the convergence threshold, e.g. 1e-10 (experimental)\n");
fprintf(pysam_stderr, "\n");
exit(1);
}
{"AF-tag",1,0,0},
{"AF-file",1,0,1},
{"AF-dflt",1,0,2},
+ {"buffer-size",1,0,'b'},
+ {"ignore-homref",0,0,'i'},
{"estimate-AF",1,0,'e'},
+ {"output",1,0,'o'},
+ {"output-type",1,0,'O'},
{"GTs-only",1,0,'G'},
- {"sample",1,0,'s'},
+ {"samples",1,0,'s'},
+ {"samples-file",1,0,'S'},
{"hw-to-az",1,0,'a'},
{"az-to-hw",1,0,'H'},
- {"viterbi-training",0,0,'V'},
+ {"viterbi-training",1,0,'V'},
{"targets",1,0,'t'},
{"targets-file",1,0,'T'},
{"regions",1,0,'r'},
{"genetic-map",1,0,'m'},
{"rec-rate",1,0,'M'},
{"skip-indels",0,0,'I'},
+ {"threads",1,0,9},
{0,0,0,0}
};
int naf_opts = 0;
char *tmp;
- while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:S:m:M:G:Ia:e:V:b:O:o:i",loptions,NULL)) >= 0) {
switch (c) {
case 0: args->af_tag = optarg; naf_opts++; break;
case 1: args->af_fname = optarg; naf_opts++; break;
args->dflt_AF = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg);
break;
+ case 'o': args->output_fname = optarg; break;
+ case 'O':
+ if ( strchr(optarg,'s') || strchr(optarg,'S') ) args->output_type |= OUTPUT_ST;
+ if ( strchr(optarg,'r') || strchr(optarg,'R') ) args->output_type |= OUTPUT_RG;
+ if ( strchr(optarg,'z') || strchr(optarg,'z') ) args->output_type |= OUTPUT_GZ;
+ break;
case 'e': args->estimate_AF = optarg; naf_opts++; break;
+ case 'b': args->buffer_size = optarg; break;
+ case 'i': args->skip_homref = 1; break;
case 'I': args->snps_only = 1; break;
case 'G':
args->fake_PLs = 1;
args->rec_rate = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -M %s\n", optarg);
break;
- case 's': args->sample = strdup(optarg); break;
+ case 's': args->samples = strdup(optarg); break;
+ case 'S': args->samples = strdup(optarg); args->samples_is_file = 1; break;
case 'a':
args->t2AZ = strtod(optarg,&tmp);
if ( *tmp ) error("Could not parse: -a %s\n", optarg);
case 'T': args->targets_list = optarg; targets_is_file = 1; break;
case 'r': args->regions_list = optarg; break;
case 'R': args->regions_list = optarg; regions_is_file = 1; break;
- case 'V': args->vi_training = 1; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
+ case 'V':
+ args->vi_training = 1;
+ args->baum_welch_th = strtod(optarg,&tmp);
+ if ( *tmp ) error("Could not parse: --viterbi-training %s\n", optarg);
+ break;
case 'h':
case '?': usage(args); break;
default: error("Unknown argument: %s\n", optarg);
}
}
+ if ( !args->output_fname ) args->output_fname = "pysam_stdout";
+ if ( !args->output_type ) args->output_type = OUTPUT_ST|OUTPUT_RG;
+ char *fname = NULL;
+ if ( optind==argc )
+ {
+ if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin
+ else usage(args);
+ }
+ else fname = argv[optind];
- if ( argc<optind+1 ) usage(args);
+ if ( args->vi_training && args->buffer_size ) error("Error: cannot use -b with -V\n");
if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ);
if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW);
if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n");
if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 )
error("Failed to read the targets: %s\n", args->af_fname);
}
- if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum));
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+ if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
while ( bcf_sr_next_line(args->files) )
vcfroh(args, args->files->readers[0].buffer[0]);
}
vcfroh(args, NULL);
- fprintf(pysam_stderr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused);
+ int i, nmin = 0;
+ for (i=0; i<args->roh_smpl->n; i++)
+ if ( !i || args->smpl[i].nused < nmin ) nmin = args->smpl[i].nused;
+ fprintf(pysam_stderr,"Number of lines total/processed: %d/%d\n", args->ntot,nmin);
+ if ( nmin==0 )
+ {
+ fprintf(pysam_stderr,"No usable sites were found.");
+ if ( !naf_opts && !args->dflt_AF ) fprintf(pysam_stderr, " Consider using one of the AF options.\n");
+ }
destroy_data(args);
free(args);
return 0;
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <inttypes.h>
#include "bcftools.h"
#include "filter.h"
+#include "bin.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
}
idist_t;
-typedef struct
-{
- double x;
- double x2;
- double y;
- double y2;
- double xy;
- double n;
-}
-smpl_r_t;
-
typedef struct
{
int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
typedef struct
{
- uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches
- float r2sum;
- uint32_t r2n;
+ uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats
+ /*
+ Pearson's R^2 is used for aggregate R^2
+ y, yy .. sum of dosage and squared dosage in the query VCF (second file)
+ x, xx .. sum of squared dosage in the truth VCF (first file)
+ n .. number of genotypes
+ */
+ double y, yy, x, xx, yx, n;
}
gtcmp_t;
int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm;
uint8_t *tmp_frm;
int dp_min, dp_max, dp_step;
- gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons
+ gtcmp_t *smpl_gts_snps, *smpl_gts_indels;
+ gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
+ bin_t *af_bins;
+ float *farr;
+ int mfarr;
// indel context
indel_ctx_t *indel_ctx;
// other
bcf_srs_t *files;
bcf_sr_regions_t *exons;
- char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
+ char **argv, *exons_fname, *regions_list, *samples_list, *targets_list, *af_bins_list, *af_tag;
int argc, verbose_sites, first_allele_only, samples_is_file;
int split_by_id, nstats;
filter_t *filter[2];
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
-
- // Per Sample r working data arrays of size equal to number of samples
- smpl_r_t* smpl_r_snps;
- smpl_r_t* smpl_r_indels;
+ int n_threads;
}
args_t;
-static int type2dosage[6], type2ploidy[6], type2stats[6];
+static int type2dosage[6], type2ploidy[6], type2stats[7];
static void idist_init(idist_t *d, int min, int max, int step)
{
return i-1+d->min;
}
+static inline int clip_nonnegative(float x, int limit)
+{
+ if (x >= limit || isnan(x)) return limit - 1;
+ else if (x <= 0.0) return 0;
+ else return (int) x;
+}
#define IC_DBG 0
#if IC_DBG
args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str);
if ( args->files->nreaders==2 )
args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str);
+ args->files->max_unpack |= filter_max_unpack(args->filter[0]);
+ }
+
+ // AF corresponds to AC but is more robust to mixtures of haploid and diploid GTs
+ if ( !args->af_bins_list )
+ {
+ args->m_af = 101;
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
+ args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ }
+ else
+ {
+ args->af_bins = bin_init(args->af_bins_list,0,1);
+
+ // m_af is used also for other af arrays, where the first bin is for
+ // singletons. However, since the last element is unused in af_bins
+ // (n boundaries form n-1 intervals), the m_af count is good for both.
+ args->m_af = bin_get_size(args->af_bins);
}
- // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs
- args->m_af = 101;
- for (i=0; i<args->files->nreaders; i++)
- if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
- args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
+ error("No such INFO tag: %s\n", args->af_tag);
#if QUAL_STATS
args->m_qual = 999;
args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
- args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
- args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
}
for (i=0; i<args->nstats; i++)
{
type2stats[GT_HOM_RR] = 0;
type2stats[GT_HET_RA] = 1;
type2stats[GT_HOM_AA] = 2;
- type2stats[GT_HET_AA] = 1;
+ type2stats[GT_HET_AA] = 3;
type2stats[GT_HAPL_R] = 0;
type2stats[GT_HAPL_A] = 2;
+ type2stats[GT_UNKN] = 4;
}
static void destroy_stats(args_t *args)
if (stats->qual_indels) free(stats->qual_indels);
#endif
#if HWE_STATS
- //if ( args->files->n_smpl ) free(stats->af_hwe);
free(stats->af_hwe);
#endif
free(stats->insertions);
if ( args->exons ) free(stats->smpl_frm_shifts);
}
for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
+ if ( args->af_bins ) bin_destroy(args->af_bins);
+ free(args->farr);
free(args->usr);
free(args->tmp_frm);
free(args->tmp_iaf);
free(args->af_gts_indels);
free(args->smpl_gts_snps);
free(args->smpl_gts_indels);
- free(args->smpl_r_snps);
- free(args->smpl_r_indels);
if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
if (args->filter[0]) filter_destroy(args->filter[0]);
if (args->filter[1]) filter_destroy(args->filter[1]);
static void init_iaf(args_t *args, bcf_sr_t *reader)
{
bcf1_t *line = reader->buffer[0];
- if ( args->ntmp_iaf < line->n_allele )
+ hts_expand(int32_t,line->n_allele,args->ntmp_iaf,args->tmp_iaf);
+
+ int i, ret;
+ if ( args->af_tag )
{
- args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int));
- args->ntmp_iaf = line->n_allele;
+ ret = bcf_get_info_float(reader->header, line, args->af_tag, &args->farr, &args->mfarr);
+ if ( ret<=0 || ret!=line->n_allele-1 )
+ {
+ // the AF tag is not present or wrong number of values, put in the singletons/unknown bin
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0;
+ return;
+ }
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ float af = args->farr[i-1];
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1; // the first tmp_iaf bin is reserved for singletons
+ }
+ return;
}
+
// tmp_iaf is first filled with AC counts in calc_ac and then transformed to
// an index to af_gts_snps
- int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
- if ( ret )
+ ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
+ if ( !ret )
{
- int an=0;
- for (i=0; i<line->n_allele; i++)
- an += args->tmp_iaf[i];
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0; // singletons/unknown bin
+ return;
+ }
- args->tmp_iaf[0] = 0;
- for (i=1; i<line->n_allele; i++)
+ int an = 0;
+ for (i=0; i<line->n_allele; i++)
+ an += args->tmp_iaf[i];
+
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->tmp_iaf[i]==1 )
+ args->tmp_iaf[i] = 0; // singletons into the first bin
+ else if ( !an )
+ args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
+ else
{
- if ( args->tmp_iaf[i]==1 )
- args->tmp_iaf[i] = 0; // singletons into the first bin
- else if ( !an )
- args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
- else
- args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an;
+ float af = (float) args->tmp_iaf[i] / an;
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1;
}
}
- else
- for (i=0; i<line->n_allele; i++)
- args->tmp_iaf[i] = 0;
-
- // todo: otherwise use AF
}
static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
bcf1_t *line = reader->buffer[0];
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_indels[iqual]++;
#endif
if ( ref<0 ) return;
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_snps[iqual]++;
#endif
{
float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
int idx = het_frac*(args->naf_hwe - 1);
+//check me: what is this?
if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
stats->af_hwe[idx]++;
}
fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return;
// only the first ALT allele is considered
- int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1;
+ int iaf = args->tmp_iaf[1];
int line_type = bcf_get_variant_types(files->readers[0].buffer[0]);
gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels;
gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels;
- //
- // Calculates r squared
- // x is mean dosage of x at given site
- // x2 is mean squared dosage of x at given site
- // y is mean dosage of x at given site
- // y2 is mean squared dosage of x at given site
- // xy is mean dosage of x*y at given site
- // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
- // r2n is number of sites considered
- // output as r2sum/r2n for each AF bin
- int r2n = 0;
- float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0;
- // Select smpl_r
- smpl_r_t *smpl_r = NULL;
- if (line_type&VCF_SNP)
- {
- smpl_r = args->smpl_r_snps;
- }
- else if (line_type&VCF_INDEL)
- {
- smpl_r = args->smpl_r_indels;
- }
for (is=0; is<files->n_smpl; is++)
{
// Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of
// actual alleles can be enforced by running without the -c option.
int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
- if ( gt0 == GT_UNKN ) continue;
-
int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
- if ( gt1 == GT_UNKN ) continue;
- if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
+ int idx0 = type2stats[gt0];
+ int idx1 = type2stats[gt1];
+ af_stats[iaf].gt2gt[idx0][idx1]++;
+ smpl_stats[is].gt2gt[idx0][idx1]++;
- int dsg0 = type2dosage[gt0];
- int dsg1 = type2dosage[gt1];
- x += dsg0;
- x2 += dsg0*dsg0;
- y += dsg1;
- y2 += dsg1*dsg1;
- xy += dsg0*dsg1;
- r2n++;
-
- int idx = type2stats[gt0];
- if ( gt0==gt1 )
- {
- af_stats[iaf].m[idx]++;
- smpl_stats[is].m[idx]++;
- }
- else
- {
- af_stats[iaf].mm[idx]++;
- smpl_stats[is].mm[idx]++;
- }
-
- // Now do it across samples
+ if ( gt0 == GT_UNKN || gt1 == GT_UNKN ) continue;
+ if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
- if (smpl_r) {
- smpl_r[is].xy += dsg0*dsg1;
- smpl_r[is].x += dsg0;
- smpl_r[is].x2 += dsg0*dsg0;
- smpl_r[is].y += dsg1;
- smpl_r[is].y2 += dsg1*dsg1;
- ++(smpl_r[is].n);
- }
- }
-
- if ( r2n )
- {
- x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n;
- float cov = xy - x*y;
- float var2 = (x2 - x*x) * (y2 - y*y);
- if ( var2!=0 )
- {
- af_stats[iaf].r2sum += cov*cov/var2;
- af_stats[iaf].r2n++;
- }
+ float y = type2dosage[gt0];
+ float x = type2dosage[gt1];
+
+ smpl_stats[is].yx += y*x;
+ smpl_stats[is].x += x;
+ smpl_stats[is].xx += x*x;
+ smpl_stats[is].y += y;
+ smpl_stats[is].yy += y*y;
+ smpl_stats[is].n += 1;
+
+ af_stats[iaf].yx += y*x;
+ af_stats[iaf].x += x;
+ af_stats[iaf].xx += x*x;
+ af_stats[iaf].y += y;
+ af_stats[iaf].yy += y*y;
+ af_stats[iaf].n += 1;
}
if ( args->verbose_sites )
#define T2S(x) type2stats[x]
static void print_stats(args_t *args)
{
- int i, id;
+ int i, j,k, id;
printf("# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
printf("SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
stats->af_repeats[1][1] += stats->af_repeats[1][0];
stats->af_repeats[2][1] += stats->af_repeats[2][0];
}
+ // move the singletons stats into the first AF bin, singleton stats was collected separately because of init_iaf
+ if ( args->af_gts_snps )
+ {
+ args->af_gts_snps[1].y += args->af_gts_snps[0].y;
+ args->af_gts_snps[1].yy += args->af_gts_snps[0].yy;
+ args->af_gts_snps[1].xx += args->af_gts_snps[0].xx;
+ args->af_gts_snps[1].yx += args->af_gts_snps[0].yx;
+ args->af_gts_snps[1].n += args->af_gts_snps[0].n;
+ }
+ if ( args->af_gts_indels )
+ {
+ args->af_gts_indels[1].y += args->af_gts_indels[0].y;
+ args->af_gts_indels[1].yy += args->af_gts_indels[0].yy;
+ args->af_gts_indels[1].xx += args->af_gts_indels[0].xx;
+ args->af_gts_indels[1].yx += args->af_gts_indels[0].yx;
+ args->af_gts_indels[1].n += args->af_gts_indels[0].n;
+ }
+
printf("# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
for (id=0; id<args->nstats; id++)
{
for (i=1; i<args->m_af; i++) // note that af[1] now contains also af[0], see SiS stats output above
{
if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue;
- printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ printf("AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]);
}
}
printf("SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
int x;
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
if ( x==0 )
{
- printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ printf("# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_snps;
}
else
{
- printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ printf("# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_indels;
}
- uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
+ uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins
for (i=0; i<args->m_af; i++)
{
- int j, n = 0;
- for (j=0; j<3; j++)
+ int n = 0;
+ uint64_t m[4] = {0,0,0,0}, mm[4] = {0,0,0,0}; // in i-th AF bin
+ for (j=0; j<4; j++) // rr, ra, aa hom, aa het, ./.
+ for (k=0; k<4; k++)
+ {
+ n += stats[i].gt2gt[j][k];
+ if ( j==k )
+ {
+ nrd_m[j] += stats[i].gt2gt[j][k];
+ m[j] += stats[i].gt2gt[j][k];
+ }
+ else
+ {
+ nrd_mm[j] += stats[i].gt2gt[j][k];
+ mm[j] += stats[i].gt2gt[j][k];
+ }
+ }
+ if ( !i || !n ) continue; // skip singleton stats and empty bins
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
{
- n += stats[i].m[j] + stats[i].mm[j];
- nrd_m[j] += stats[i].m[j];
- nrd_mm[j] += stats[i].mm[j];
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
- if ( !i || !n ) continue; // skip singleton stats and empty bins
- printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- printf("\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ printf("GC%cAF\t2\t%f", x==0 ? 's' : 'i', af);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]);
+ if ( stats[i].n && !isnan(r2) ) printf("\t%f", r2);
+ else printf("\t"NA_STRING);
+ printf("\t%.0f\n", stats[i].n);
}
if ( x==0 )
}
else
printf("# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
- uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)];
- uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)];
+ uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)];
+ uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)];
printf("NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
m+mm ? mm*100.0/(m+mm) : 0,
nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0,
);
}
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
- smpl_r_t *smpl_r_array;
if ( x==0 )
{
printf("# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_snps;
- smpl_r_array = args->smpl_r_snps;
}
else
{
printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_indels;
- smpl_r_array = args->smpl_r_indels;
}
for (i=0; i<args->files->n_smpl; i++)
{
- uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)];
- uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)];
- // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
- smpl_r_t *smpl_r = smpl_r_array + i;
- double r = 0.0;
- if (smpl_r->n) {
- double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula
- double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n;
- double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
- r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
+ uint64_t mm = 0, m = stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)];
+ for (j=0; j<3; j++)
+ for (k=0; k<3; k++)
+ if ( j!=k ) mm += stats[i].gt2gt[j][k];
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
+ {
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
printf("GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- if (smpl_r->n && !isnan(r)) printf("\t%f\n", r*r);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]);
+ printf("\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]);
+ if ( stats[i].n && !isnan(r2) ) printf("\t%f\n", r2);
else printf("\t"NA_STRING"\n");
}
}
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
+ {
+ //printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+
+ gtcmp_t *stats;
+ if ( x==0 )
+ {
+ printf("# GCTs, Genotype concordance table (SNPs)\n# GCTs");
+ stats = args->smpl_gts_snps;
+ }
+ else
+ {
+ printf("# GCTi, Genotype concordance table (indels)\n# GCTi");
+ stats = args->smpl_gts_indels;
+ }
+ i = 1;
+ printf("\t[%d]sample", ++i);
+ printf("\t[%d]RR Hom -> RR Hom", ++i);
+ printf("\t[%d]RR Hom -> RA Het", ++i);
+ printf("\t[%d]RR Hom -> AA Hom", ++i);
+ printf("\t[%d]RR Hom -> AA Het", ++i);
+ printf("\t[%d]RR Hom -> missing", ++i);
+ printf("\t[%d]RA Het -> RR Hom", ++i);
+ printf("\t[%d]RA Het -> RA Het", ++i);
+ printf("\t[%d]RA Het -> AA Hom", ++i);
+ printf("\t[%d]RA Het -> AA Het", ++i);
+ printf("\t[%d]RA Het -> missing", ++i);
+ printf("\t[%d]AA Hom -> RR Hom", ++i);
+ printf("\t[%d]AA Hom -> RA Het", ++i);
+ printf("\t[%d]AA Hom -> AA Hom", ++i);
+ printf("\t[%d]AA Hom -> AA Het", ++i);
+ printf("\t[%d]AA Hom -> missing", ++i);
+ printf("\t[%d]AA Het -> RR Hom", ++i);
+ printf("\t[%d]AA Het -> RA Het", ++i);
+ printf("\t[%d]AA Het -> AA Hom", ++i);
+ printf("\t[%d]AA Het -> AA Het", ++i);
+ printf("\t[%d]AA Het -> missing", ++i);
+ printf("\t[%d]missing -> RR Hom", ++i);
+ printf("\t[%d]missing -> RA Het", ++i);
+ printf("\t[%d]missing -> AA Hom", ++i);
+ printf("\t[%d]missing -> AA Het", ++i);
+ printf("\t[%d]missing -> missing\n", ++i);
+
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ printf("GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]);
+ for (j=0; j<5; j++)
+ for (k=0; k<5; k++)
+ printf("\t%"PRId64, stats[i].gt2gt[j][k]);
+ printf("\n");
+ }
+ }
}
printf("# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
for (j=0; j<args->naf_hwe; j++) sum_tot += ptr[j];
if ( !sum_tot ) continue;
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+
int nprn = 3;
- printf("HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
+ printf("HWE\t%d\t%f\t%d",id,af,sum_tot);
for (j=0; j<args->naf_hwe; j++)
{
sum_tmp += ptr[j];
fprintf(stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Options:\n");
+ fprintf(stderr, " --af-bins <list> allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+ fprintf(stderr, " --af-tag <string> allele frequency tag to use, by default estimated from AN,AC or GT\n");
fprintf(stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
fprintf(stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
fprintf(stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
fprintf(stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
fprintf(stderr, "\n");
exit(1);
static struct option loptions[] =
{
+ {"af-bins",1,0,1},
+ {"af-tag",1,0,2},
{"1st-allele-only",0,0,'1'},
{"include",1,0,'i'},
{"exclude",1,0,'e'},
{"targets-file",1,0,'T'},
{"fasta-ref",1,0,'F'},
{"user-tstv",1,0,'u'},
+ {"threads",1,0,9},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) {
switch (c) {
+ case 1 : args->af_bins_list = optarg; break;
+ case 2 : args->af_tag = optarg; break;
case 'u': add_user_stats(args,optarg); break;
case '1': args->first_allele_only = 1; break;
case 'F': args->ref_fname = optarg; break;
case 'I': args->split_by_id = 1; break;
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
error("Failed to read the targets: %s\n", args->targets_list);
if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+
while (fname)
{
if ( !bcf_sr_add_reader(args->files, fname) )
/* vcfstats.c -- Produces stats which can be plotted using plot-vcfstats.
- Copyright (C) 2012-2015 Genome Research Ltd.
+ Copyright (C) 2012-2016 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
#include <inttypes.h>
#include "bcftools.h"
#include "filter.h"
+#include "bin.h"
// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
}
idist_t;
-typedef struct
-{
- double x;
- double x2;
- double y;
- double y2;
- double xy;
- double n;
-}
-smpl_r_t;
-
typedef struct
{
int n_snps, n_indels, n_mnps, n_others, n_mals, n_snp_mals, n_records, n_noalts;
typedef struct
{
- uint64_t m[3], mm[3]; // number of hom, het and non-ref hom matches and mismatches
- float r2sum;
- uint32_t r2n;
+ uint64_t gt2gt[5][5]; // number of RR->RR, RR->RA, etc. matches/mismatches; see type2stats
+ /*
+ Pearson's R^2 is used for aggregate R^2
+ y, yy .. sum of dosage and squared dosage in the query VCF (second file)
+ x, xx .. sum of squared dosage in the truth VCF (first file)
+ n .. number of genotypes
+ */
+ double y, yy, x, xx, yx, n;
}
gtcmp_t;
int *tmp_iaf, ntmp_iaf, m_af, m_qual, naf_hwe, mtmp_frm;
uint8_t *tmp_frm;
int dp_min, dp_max, dp_step;
- gtcmp_t *af_gts_snps, *af_gts_indels, *smpl_gts_snps, *smpl_gts_indels; // first bin of af_* stats are singletons
+ gtcmp_t *smpl_gts_snps, *smpl_gts_indels;
+ gtcmp_t *af_gts_snps, *af_gts_indels; // first bin of af_* stats are singletons
+ bin_t *af_bins;
+ float *farr;
+ int mfarr;
// indel context
indel_ctx_t *indel_ctx;
// other
bcf_srs_t *files;
bcf_sr_regions_t *exons;
- char **argv, *exons_fname, *regions_list, *samples_list, *targets_list;
+ char **argv, *exons_fname, *regions_list, *samples_list, *targets_list, *af_bins_list, *af_tag;
int argc, verbose_sites, first_allele_only, samples_is_file;
int split_by_id, nstats;
filter_t *filter[2];
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE
-
- // Per Sample r working data arrays of size equal to number of samples
- smpl_r_t* smpl_r_snps;
- smpl_r_t* smpl_r_indels;
+ int n_threads;
}
args_t;
-static int type2dosage[6], type2ploidy[6], type2stats[6];
+static int type2dosage[6], type2ploidy[6], type2stats[7];
static void idist_init(idist_t *d, int min, int max, int step)
{
return i-1+d->min;
}
+static inline int clip_nonnegative(float x, int limit)
+{
+ if (x >= limit || isnan(x)) return limit - 1;
+ else if (x <= 0.0) return 0;
+ else return (int) x;
+}
#define IC_DBG 0
#if IC_DBG
args->filter[0] = filter_init(bcf_sr_get_header(args->files,0), args->filter_str);
if ( args->files->nreaders==2 )
args->filter[1] = filter_init(bcf_sr_get_header(args->files,1), args->filter_str);
+ args->files->max_unpack |= filter_max_unpack(args->filter[0]);
+ }
+
+ // AF corresponds to AC but is more robust to mixtures of haploid and diploid GTs
+ if ( !args->af_bins_list )
+ {
+ args->m_af = 101;
+ for (i=0; i<args->files->nreaders; i++)
+ if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
+ args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ }
+ else
+ {
+ args->af_bins = bin_init(args->af_bins_list,0,1);
+
+ // m_af is used also for other af arrays, where the first bin is for
+ // singletons. However, since the last element is unused in af_bins
+ // (n boundaries form n-1 intervals), the m_af count is good for both.
+ args->m_af = bin_get_size(args->af_bins);
}
- // AF corresponds to AC but is more robust for mixture of haploid and diploid GTs
- args->m_af = 101;
- for (i=0; i<args->files->nreaders; i++)
- if ( bcf_hdr_nsamples(args->files->readers[i].header) + 1> args->m_af )
- args->m_af = bcf_hdr_nsamples(args->files->readers[i].header) + 1;
+ bcf_hdr_t *hdr = bcf_sr_get_header(args->files,0);
+ if ( args->af_tag && !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,bcf_hdr_id2int(hdr,BCF_DT_ID,args->af_tag)) )
+ error("No such INFO tag: %s\n", args->af_tag);
#if QUAL_STATS
args->m_qual = 999;
args->af_gts_indels = (gtcmp_t *) calloc(args->m_af,sizeof(gtcmp_t));
args->smpl_gts_snps = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
args->smpl_gts_indels = (gtcmp_t *) calloc(args->files->n_smpl,sizeof(gtcmp_t));
- args->smpl_r_snps = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
- args->smpl_r_indels = (smpl_r_t*) calloc(args->files->n_smpl, sizeof(smpl_r_t));
}
for (i=0; i<args->nstats; i++)
{
type2stats[GT_HOM_RR] = 0;
type2stats[GT_HET_RA] = 1;
type2stats[GT_HOM_AA] = 2;
- type2stats[GT_HET_AA] = 1;
+ type2stats[GT_HET_AA] = 3;
type2stats[GT_HAPL_R] = 0;
type2stats[GT_HAPL_A] = 2;
+ type2stats[GT_UNKN] = 4;
}
static void destroy_stats(args_t *args)
if (stats->qual_indels) free(stats->qual_indels);
#endif
#if HWE_STATS
- //if ( args->files->n_smpl ) free(stats->af_hwe);
free(stats->af_hwe);
#endif
free(stats->insertions);
if ( args->exons ) free(stats->smpl_frm_shifts);
}
for (j=0; j<args->nusr; j++) free(args->usr[j].tag);
+ if ( args->af_bins ) bin_destroy(args->af_bins);
+ free(args->farr);
free(args->usr);
free(args->tmp_frm);
free(args->tmp_iaf);
free(args->af_gts_indels);
free(args->smpl_gts_snps);
free(args->smpl_gts_indels);
- free(args->smpl_r_snps);
- free(args->smpl_r_indels);
if (args->indel_ctx) indel_ctx_destroy(args->indel_ctx);
if (args->filter[0]) filter_destroy(args->filter[0]);
if (args->filter[1]) filter_destroy(args->filter[1]);
static void init_iaf(args_t *args, bcf_sr_t *reader)
{
bcf1_t *line = reader->buffer[0];
- if ( args->ntmp_iaf < line->n_allele )
+ hts_expand(int32_t,line->n_allele,args->ntmp_iaf,args->tmp_iaf);
+
+ int i, ret;
+ if ( args->af_tag )
{
- args->tmp_iaf = (int*)realloc(args->tmp_iaf, line->n_allele*sizeof(int));
- args->ntmp_iaf = line->n_allele;
+ ret = bcf_get_info_float(reader->header, line, args->af_tag, &args->farr, &args->mfarr);
+ if ( ret<=0 || ret!=line->n_allele-1 )
+ {
+ // the AF tag is not present or wrong number of values, put in the singletons/unknown bin
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0;
+ return;
+ }
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ float af = args->farr[i-1];
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1; // the first tmp_iaf bin is reserved for singletons
+ }
+ return;
}
+
// tmp_iaf is first filled with AC counts in calc_ac and then transformed to
// an index to af_gts_snps
- int i, ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
- if ( ret )
+ ret = bcf_calc_ac(reader->header, line, args->tmp_iaf, args->samples_list ? BCF_UN_INFO|BCF_UN_FMT : BCF_UN_INFO);
+ if ( !ret )
{
- int an=0;
- for (i=0; i<line->n_allele; i++)
- an += args->tmp_iaf[i];
+ for (i=0; i<line->n_allele; i++) args->tmp_iaf[i] = 0; // singletons/unknown bin
+ return;
+ }
- args->tmp_iaf[0] = 0;
- for (i=1; i<line->n_allele; i++)
+ int an = 0;
+ for (i=0; i<line->n_allele; i++)
+ an += args->tmp_iaf[i];
+
+ args->tmp_iaf[0] = 0;
+ for (i=1; i<line->n_allele; i++)
+ {
+ if ( args->tmp_iaf[i]==1 )
+ args->tmp_iaf[i] = 0; // singletons into the first bin
+ else if ( !an )
+ args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
+ else
{
- if ( args->tmp_iaf[i]==1 )
- args->tmp_iaf[i] = 0; // singletons into the first bin
- else if ( !an )
- args->tmp_iaf[i] = 1; // no genotype at all, put to the AF=0 bin
- else
- args->tmp_iaf[i] = 1 + args->tmp_iaf[i] * (args->m_af-2.0) / an;
+ float af = (float) args->tmp_iaf[i] / an;
+ if ( af<0 ) af = 0;
+ else if ( af>1 ) af = 1;
+ int iaf = args->af_bins ? bin_get_idx(args->af_bins,af) : af*(args->m_af-2);
+ args->tmp_iaf[i] = iaf + 1;
}
}
- else
- for (i=0; i<line->n_allele; i++)
- args->tmp_iaf[i] = 0;
-
- // todo: otherwise use AF
}
static inline void do_mnp_stats(args_t *args, stats_t *stats, bcf_sr_t *reader)
bcf1_t *line = reader->buffer[0];
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_indels[iqual]++;
#endif
if ( ref<0 ) return;
#if QUAL_STATS
- int iqual = line->qual >= args->m_qual || isnan(line->qual) ? args->m_qual - 1 : line->qual;
+ int iqual = clip_nonnegative(line->qual, args->m_qual);
stats->qual_snps[iqual]++;
#endif
{
float het_frac = (float)nhet_tot/(nhet_tot + nref_tot + nalt_tot);
int idx = het_frac*(args->naf_hwe - 1);
+//check me: what is this?
if ( line->n_allele>1 ) idx += args->naf_hwe*args->tmp_iaf[1];
stats->af_hwe[idx]++;
}
fmt1 = bcf_get_fmt(files->readers[1].header,files->readers[1].buffer[0],"GT"); if ( !fmt1 ) return;
// only the first ALT allele is considered
- int iaf = line->n_allele>1 ? args->tmp_iaf[1] : 1;
+ int iaf = args->tmp_iaf[1];
int line_type = bcf_get_variant_types(files->readers[0].buffer[0]);
gtcmp_t *af_stats = line_type&VCF_SNP ? args->af_gts_snps : args->af_gts_indels;
gtcmp_t *smpl_stats = line_type&VCF_SNP ? args->smpl_gts_snps : args->smpl_gts_indels;
- //
- // Calculates r squared
- // x is mean dosage of x at given site
- // x2 is mean squared dosage of x at given site
- // y is mean dosage of x at given site
- // y2 is mean squared dosage of x at given site
- // xy is mean dosage of x*y at given site
- // r2sum += (xy - x*y)^2 / ( (x2 - x^2) * (y2 - y^2) )
- // r2n is number of sites considered
- // output as r2sum/r2n for each AF bin
- int r2n = 0;
- float x = 0, y = 0, xy = 0, x2 = 0, y2 = 0;
- // Select smpl_r
- smpl_r_t *smpl_r = NULL;
- if (line_type&VCF_SNP)
- {
- smpl_r = args->smpl_r_snps;
- }
- else if (line_type&VCF_INDEL)
- {
- smpl_r = args->smpl_r_indels;
- }
for (is=0; is<files->n_smpl; is++)
{
// Simplified comparison: only 0/0, 0/1, 1/1 is looked at as the identity of
// actual alleles can be enforced by running without the -c option.
int gt0 = bcf_gt_type(fmt0, files->readers[0].samples[is], NULL, NULL);
- if ( gt0 == GT_UNKN ) continue;
-
int gt1 = bcf_gt_type(fmt1, files->readers[1].samples[is], NULL, NULL);
- if ( gt1 == GT_UNKN ) continue;
- if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
+ int idx0 = type2stats[gt0];
+ int idx1 = type2stats[gt1];
+ af_stats[iaf].gt2gt[idx0][idx1]++;
+ smpl_stats[is].gt2gt[idx0][idx1]++;
- int dsg0 = type2dosage[gt0];
- int dsg1 = type2dosage[gt1];
- x += dsg0;
- x2 += dsg0*dsg0;
- y += dsg1;
- y2 += dsg1*dsg1;
- xy += dsg0*dsg1;
- r2n++;
-
- int idx = type2stats[gt0];
- if ( gt0==gt1 )
- {
- af_stats[iaf].m[idx]++;
- smpl_stats[is].m[idx]++;
- }
- else
- {
- af_stats[iaf].mm[idx]++;
- smpl_stats[is].mm[idx]++;
- }
-
- // Now do it across samples
+ if ( gt0 == GT_UNKN || gt1 == GT_UNKN ) continue;
+ if ( type2ploidy[gt0]*type2ploidy[gt1] == -1 ) continue; // cannot compare diploid and haploid genotypes
- if (smpl_r) {
- smpl_r[is].xy += dsg0*dsg1;
- smpl_r[is].x += dsg0;
- smpl_r[is].x2 += dsg0*dsg0;
- smpl_r[is].y += dsg1;
- smpl_r[is].y2 += dsg1*dsg1;
- ++(smpl_r[is].n);
- }
- }
-
- if ( r2n )
- {
- x /= r2n; y /= r2n; x2 /= r2n; y2 /= r2n; xy /= r2n;
- float cov = xy - x*y;
- float var2 = (x2 - x*x) * (y2 - y*y);
- if ( var2!=0 )
- {
- af_stats[iaf].r2sum += cov*cov/var2;
- af_stats[iaf].r2n++;
- }
+ float y = type2dosage[gt0];
+ float x = type2dosage[gt1];
+
+ smpl_stats[is].yx += y*x;
+ smpl_stats[is].x += x;
+ smpl_stats[is].xx += x*x;
+ smpl_stats[is].y += y;
+ smpl_stats[is].yy += y*y;
+ smpl_stats[is].n += 1;
+
+ af_stats[iaf].yx += y*x;
+ af_stats[iaf].x += x;
+ af_stats[iaf].xx += x*x;
+ af_stats[iaf].y += y;
+ af_stats[iaf].yy += y*y;
+ af_stats[iaf].n += 1;
}
if ( args->verbose_sites )
#define T2S(x) type2stats[x]
static void print_stats(args_t *args)
{
- int i, id;
+ int i, j,k, id;
fprintf(pysam_stdout, "# SN, Summary numbers:\n# SN\t[2]id\t[3]key\t[4]value\n");
for (id=0; id<args->files->nreaders; id++)
fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", id, bcf_hdr_nsamples(args->files->readers[id].header));
stats->af_repeats[1][1] += stats->af_repeats[1][0];
stats->af_repeats[2][1] += stats->af_repeats[2][0];
}
+ // move the singletons stats into the first AF bin, singleton stats was collected separately because of init_iaf
+ if ( args->af_gts_snps )
+ {
+ args->af_gts_snps[1].y += args->af_gts_snps[0].y;
+ args->af_gts_snps[1].yy += args->af_gts_snps[0].yy;
+ args->af_gts_snps[1].xx += args->af_gts_snps[0].xx;
+ args->af_gts_snps[1].yx += args->af_gts_snps[0].yx;
+ args->af_gts_snps[1].n += args->af_gts_snps[0].n;
+ }
+ if ( args->af_gts_indels )
+ {
+ args->af_gts_indels[1].y += args->af_gts_indels[0].y;
+ args->af_gts_indels[1].yy += args->af_gts_indels[0].yy;
+ args->af_gts_indels[1].xx += args->af_gts_indels[0].xx;
+ args->af_gts_indels[1].yx += args->af_gts_indels[0].yx;
+ args->af_gts_indels[1].n += args->af_gts_indels[0].n;
+ }
+
fprintf(pysam_stdout, "# AF, Stats by non-reference allele frequency:\n# AF\t[2]id\t[3]allele frequency\t[4]number of SNPs\t[5]number of transitions\t[6]number of transversions\t[7]number of indels\t[8]repeat-consistent\t[9]repeat-inconsistent\t[10]not applicable\n");
for (id=0; id<args->nstats; id++)
{
for (i=1; i<args->m_af; i++) // note that af[1] now contains also af[0], see SiS stats output above
{
if ( stats->af_snps[i]+stats->af_ts[i]+stats->af_tv[i]+stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i] == 0 ) continue;
- fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,100.*(i-1)/(args->m_af-1),stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ fprintf(pysam_stdout, "AF\t%d\t%f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", id,af,stats->af_snps[i],stats->af_ts[i],stats->af_tv[i],
stats->af_repeats[0][i]+stats->af_repeats[1][i]+stats->af_repeats[2][i],stats->af_repeats[0][i],stats->af_repeats[1][i],stats->af_repeats[2][i]);
}
}
fprintf(pysam_stdout, "SN\t%d\tnumber of samples:\t%d\n", 2, args->files->n_smpl);
int x;
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
if ( x==0 )
{
- fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ fprintf(pysam_stdout, "# GCsAF, Genotype concordance by non-reference allele frequency (SNPs)\n# GCsAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_snps;
}
else
{
- fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of sites\n");
+ fprintf(pysam_stdout, "# GCiAF, Genotype concordance by non-reference allele frequency (indels)\n# GCiAF\t[2]id\t[3]allele frequency\t[4]RR Hom matches\t[5]RA Het matches\t[6]AA Hom matches\t[7]RR Hom mismatches\t[8]RA Het mismatches\t[9]AA Hom mismatches\t[10]dosage r-squared\t[11]number of genotypes\n");
stats = args->af_gts_indels;
}
- uint64_t nrd_m[3] = {0,0,0}, nrd_mm[3] = {0,0,0};
+ uint64_t nrd_m[4] = {0,0,0,0}, nrd_mm[4] = {0,0,0,0}; // across all bins
for (i=0; i<args->m_af; i++)
{
- int j, n = 0;
- for (j=0; j<3; j++)
+ int n = 0;
+ uint64_t m[4] = {0,0,0,0}, mm[4] = {0,0,0,0}; // in i-th AF bin
+ for (j=0; j<4; j++) // rr, ra, aa hom, aa het, ./.
+ for (k=0; k<4; k++)
+ {
+ n += stats[i].gt2gt[j][k];
+ if ( j==k )
+ {
+ nrd_m[j] += stats[i].gt2gt[j][k];
+ m[j] += stats[i].gt2gt[j][k];
+ }
+ else
+ {
+ nrd_mm[j] += stats[i].gt2gt[j][k];
+ mm[j] += stats[i].gt2gt[j][k];
+ }
+ }
+ if ( !i || !n ) continue; // skip singleton stats and empty bins
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
{
- n += stats[i].m[j] + stats[i].mm[j];
- nrd_m[j] += stats[i].m[j];
- nrd_mm[j] += stats[i].mm[j];
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
- if ( !i || !n ) continue; // skip singleton stats and empty bins
- fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', 100.*(i-1)/(args->m_af-1));
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- fprintf(pysam_stdout, "\t%f\t%"PRId32"\n", stats[i].r2n ? stats[i].r2sum/stats[i].r2n : -1.0, stats[i].r2n);
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+ fprintf(pysam_stdout, "GC%cAF\t2\t%f", x==0 ? 's' : 'i', af);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", m[T2S(GT_HOM_RR)],m[T2S(GT_HET_RA)],m[T2S(GT_HOM_AA)]);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", mm[T2S(GT_HOM_RR)],mm[T2S(GT_HET_RA)],mm[T2S(GT_HOM_AA)]);
+ if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f", r2);
+ else fprintf(pysam_stdout, "\t"NA_STRING);
+ fprintf(pysam_stdout, "\t%.0f\n", stats[i].n);
}
if ( x==0 )
}
else
fprintf(pysam_stdout, "# Non-Reference Discordance (NRD), indels\n# NRDi\t[2]id\t[3]NRD\t[4]Ref/Ref discordance\t[5]Ref/Alt discordance\t[6]Alt/Alt discordance\n");
- uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)];
- uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)];
+ uint64_t m = nrd_m[T2S(GT_HET_RA)] + nrd_m[T2S(GT_HOM_AA)] + nrd_m[T2S(GT_HET_AA)];
+ uint64_t mm = nrd_mm[T2S(GT_HOM_RR)] + nrd_mm[T2S(GT_HET_RA)] + nrd_mm[T2S(GT_HOM_AA)] + nrd_mm[T2S(GT_HET_AA)];
fprintf(pysam_stdout, "NRD%c\t2\t%f\t%f\t%f\t%f\n", x==0 ? 's' : 'i',
m+mm ? mm*100.0/(m+mm) : 0,
nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)] ? nrd_mm[T2S(GT_HOM_RR)]*100.0/(nrd_m[T2S(GT_HOM_RR)]+nrd_mm[T2S(GT_HOM_RR)]) : 0,
);
}
- for (x=0; x<2; x++)
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
{
gtcmp_t *stats;
- smpl_r_t *smpl_r_array;
if ( x==0 )
{
fprintf(pysam_stdout, "# GCsS, Genotype concordance by sample (SNPs)\n# GCsS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_snps;
- smpl_r_array = args->smpl_r_snps;
}
else
{
fprintf(pysam_stdout, "# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
stats = args->smpl_gts_indels;
- smpl_r_array = args->smpl_r_indels;
}
for (i=0; i<args->files->n_smpl; i++)
{
- uint64_t m = stats[i].m[T2S(GT_HET_RA)] + stats[i].m[T2S(GT_HOM_AA)];
- uint64_t mm = stats[i].mm[T2S(GT_HOM_RR)] + stats[i].mm[T2S(GT_HET_RA)] + stats[i].mm[T2S(GT_HOM_AA)];
- // Calculate r by formula 19.2 - Biostatistical Analysis 4th edition - Jerrold H. Zar
- smpl_r_t *smpl_r = smpl_r_array + i;
- double r = 0.0;
- if (smpl_r->n) {
- double sum_crossprod = smpl_r->xy-(smpl_r->x*smpl_r->y)/smpl_r->n;//per 17.3 machine formula
- double x2_xx = smpl_r->x2-(smpl_r->x*smpl_r->x)/smpl_r->n;
- double y2_yy = smpl_r->y2-(smpl_r->y*smpl_r->y)/smpl_r->n;
- r = (sum_crossprod)/sqrt(x2_xx*y2_yy);
+ uint64_t mm = 0, m = stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)];
+ for (j=0; j<3; j++)
+ for (k=0; k<3; k++)
+ if ( j!=k ) mm += stats[i].gt2gt[j][k];
+
+ // Pearson's r2
+ double r2 = 0;
+ if ( stats[i].n )
+ {
+ r2 = (stats[i].yx - stats[i].x*stats[i].y/stats[i].n);
+ r2 /= sqrt((stats[i].xx - stats[i].x*stats[i].x/stats[i].n) * (stats[i].yy - stats[i].y*stats[i].y/stats[i].n));
+ r2 *= r2;
}
fprintf(pysam_stdout, "GC%cS\t2\t%s\t%.3f", x==0 ? 's' : 'i', args->files->samples[i], m+mm ? mm*100.0/(m+mm) : 0);
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].m[T2S(GT_HOM_RR)],stats[i].m[T2S(GT_HET_RA)],stats[i].m[T2S(GT_HOM_AA)]);
- fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"", stats[i].mm[T2S(GT_HOM_RR)],stats[i].mm[T2S(GT_HET_RA)],stats[i].mm[T2S(GT_HOM_AA)]);
- if (smpl_r->n && !isnan(r)) fprintf(pysam_stdout, "\t%f\n", r*r);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_RR)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HET_RA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_AA)]);
+ fprintf(pysam_stdout, "\t%"PRId64"\t%"PRId64"\t%"PRId64"",
+ stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HET_RA)] + stats[i].gt2gt[T2S(GT_HOM_RR)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HET_RA)][T2S(GT_HOM_AA)],
+ stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HOM_RR)] + stats[i].gt2gt[T2S(GT_HOM_AA)][T2S(GT_HET_RA)]);
+ if ( stats[i].n && !isnan(r2) ) fprintf(pysam_stdout, "\t%f\n", r2);
else fprintf(pysam_stdout, "\t"NA_STRING"\n");
}
}
+ for (x=0; x<2; x++) // x=0: snps, x=1: indels
+ {
+ //printf("# GCiS, Genotype concordance by sample (indels)\n# GCiS\t[2]id\t[3]sample\t[4]non-reference discordance rate\t[5]RR Hom matches\t[6]RA Het matches\t[7]AA Hom matches\t[8]RR Hom mismatches\t[9]RA Het mismatches\t[10]AA Hom mismatches\t[11]dosage r-squared\n");
+
+ gtcmp_t *stats;
+ if ( x==0 )
+ {
+ fprintf(pysam_stdout, "# GCTs, Genotype concordance table (SNPs)\n# GCTs");
+ stats = args->smpl_gts_snps;
+ }
+ else
+ {
+ fprintf(pysam_stdout, "# GCTi, Genotype concordance table (indels)\n# GCTi");
+ stats = args->smpl_gts_indels;
+ }
+ i = 1;
+ fprintf(pysam_stdout, "\t[%d]sample", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RR Hom -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]RA Het -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Hom -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]AA Het -> missing", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> RR Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> RA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> AA Hom", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> AA Het", ++i);
+ fprintf(pysam_stdout, "\t[%d]missing -> missing\n", ++i);
+
+ for (i=0; i<args->files->n_smpl; i++)
+ {
+ fprintf(pysam_stdout, "GCT%c\t%s", x==0 ? 's' : 'i', args->files->samples[i]);
+ for (j=0; j<5; j++)
+ for (k=0; k<5; k++)
+ fprintf(pysam_stdout, "\t%"PRId64, stats[i].gt2gt[j][k]);
+ fprintf(pysam_stdout, "\n");
+ }
+ }
}
fprintf(pysam_stdout, "# DP, Depth distribution\n# DP\t[2]id\t[3]bin\t[4]number of genotypes\t[5]fraction of genotypes (%%)\t[6]number of sites\t[7]fraction of sites (%%)\n");
for (j=0; j<args->naf_hwe; j++) sum_tot += ptr[j];
if ( !sum_tot ) continue;
+ double af = args->af_bins ? (bin_get_value(args->af_bins,i)+bin_get_value(args->af_bins,i-1))*0.5 : (double)(i-1)/(args->m_af-1);
+
int nprn = 3;
- fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,100.*(i-1)/(args->m_af-1),sum_tot);
+ fprintf(pysam_stdout, "HWE\t%d\t%f\t%d",id,af,sum_tot);
for (j=0; j<args->naf_hwe; j++)
{
sum_tmp += ptr[j];
fprintf(pysam_stderr, "Usage: bcftools stats [options] <A.vcf.gz> [<B.vcf.gz>]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " --af-bins <list> allele frequency bins, a list (0.1,0.5,1) or a file (0.1\\n0.5\\n1)\n");
+ fprintf(pysam_stderr, " --af-tag <string> allele frequency tag to use, by default estimated from AN,AC or GT\n");
fprintf(pysam_stderr, " -1, --1st-allele-only include only 1st allele at multiallelic sites\n");
fprintf(pysam_stderr, " -c, --collapse <string> treat as identical records with <snps|indels|both|all|some|none>, see man page for details [none]\n");
fprintf(pysam_stderr, " -d, --depth <int,int,int> depth distribution: min,max,bin size [0,500,1]\n");
fprintf(pysam_stderr, " -t, --targets <region> similar to -r but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -T, --targets-file <file> similar to -R but streams rather than index-jumps\n");
fprintf(pysam_stderr, " -u, --user-tstv <TAG[:min:max:n]> collect Ts/Tv stats for any tag using the given binning [0:1:100]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra decompression threads [0]\n");
fprintf(pysam_stderr, " -v, --verbose produce verbose per-site and per-sample output\n");
fprintf(pysam_stderr, "\n");
exit(1);
static struct option loptions[] =
{
+ {"af-bins",1,0,1},
+ {"af-tag",1,0,2},
{"1st-allele-only",0,0,'1'},
{"include",1,0,'i'},
{"exclude",1,0,'e'},
{"targets-file",1,0,'T'},
{"fasta-ref",1,0,'F'},
{"user-tstv",1,0,'u'},
+ {"threads",1,0,9},
{0,0,0,0}
};
while ((c = getopt_long(argc, argv, "hc:r:R:e:s:S:d:i:t:T:F:f:1u:vIE:",loptions,NULL)) >= 0) {
switch (c) {
+ case 1 : args->af_bins_list = optarg; break;
+ case 2 : args->af_tag = optarg; break;
case 'u': add_user_stats(args,optarg); break;
case '1': args->first_allele_only = 1; break;
case 'F': args->ref_fname = optarg; break;
case 'I': args->split_by_id = 1; break;
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
+ case 9 : args->n_threads = strtol(optarg, 0, 0); break;
case 'h':
case '?': usage();
default: error("Unknown argument: %s\n", optarg);
error("Failed to read the targets: %s\n", args->targets_list);
if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
error("Failed to read the regions: %s\n", args->regions_list);
+ if ( args->n_threads && bcf_sr_set_threads(args->files, args->n_threads)<0)
+ error("Failed to create threads\n");
+
while (fname)
{
if ( !bcf_sr_add_reader(args->files, fname) )
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
if (args->include_types) {
args->include = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
else {
fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
if (args->exclude_types) {
args->exclude = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
else {
fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out, args->n_threads);
+ if ( args->n_threads > 0)
+ hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
// headers: hdr=full header, hsub=subset header, hnull=sites only header
if (args->sites_only){
if (args->include || args->exclude)
{
int line_type = bcf_get_variant_types(line);
- if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
- if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types
+ if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
+ if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types
}
if ( args->filter )
}
}
- if (args->min_ac)
+ if (args->min_ac!=-1)
{
if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
}
- if (args->max_ac)
+ if (args->max_ac!=-1)
{
if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
}
- if (args->min_af)
+ if (args->min_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
}
- if (args->max_af)
+ if (args->max_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
if (args->trim_alts)
{
int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
- if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
+ if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
}
if (args->phased) {
int phased = bcf_all_phased(args->hdr, line);
fprintf(stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
fprintf(stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Subset options:\n");
fprintf(stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
fprintf(stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
fprintf(stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
fprintf(stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
- fprintf(stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
+ fprintf(stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
fprintf(stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
fprintf(stderr, "\n");
exit(1);
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
error("Failed to read the targets: %s\n", args->targets_list);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
bcf_hdr_write(args->out, out_hdr);
else if ( args->output_type & FT_BCF )
error("BCF output requires header, cannot proceed with -H\n");
+
+ int ret = 0;
if (!args->header_only)
{
while ( bcf_sr_next_line(args->files) )
if ( subset_vcf(args, line) )
bcf_write1(args->out, out_hdr, line);
}
+ ret = args->files->errnum;
+ if ( ret ) fprintf(stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
}
hts_close(args->out);
destroy_data(args);
bcf_sr_destroy(args->files);
free(args);
- return 0;
+ return ret;
}
THE SOFTWARE. */
#include <stdio.h>
+#include <strings.h>
#include <unistd.h>
#include <getopt.h>
#include <ctype.h>
if (args->include_types) {
args->include = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->include |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->include |= VCF_BND<<1;
else {
fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n");
if (args->exclude_types) {
args->exclude = 0;
for (i = 0; i < n; ++i) {
- if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
- else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
- else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
- else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
+ if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP<<1;
+ else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL<<1;
+ else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP<<1;
+ else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "ref") == 0) args->exclude |= VCF_OTHER<<1;
+ else if (strcmp(type_list[i], "bnd") == 0) args->exclude |= VCF_BND<<1;
else {
fprintf(pysam_stderr, "[E::%s] unknown type\n", type_list[i]);
fprintf(pysam_stderr, "Accepted types are snps, indels, mnps, other\n");
else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF
args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));
- if ( args->n_threads ) hts_set_threads(args->out, args->n_threads);
+ if ( args->n_threads > 0)
+ hts_set_opt(args->out, HTS_OPT_THREAD_POOL, args->files->p);
// headers: hdr=full header, hsub=subset header, hnull=sites only header
if (args->sites_only){
if (args->include || args->exclude)
{
int line_type = bcf_get_variant_types(line);
- if ( args->include && !(line_type&args->include) ) return 0; // include only given variant types
- if ( args->exclude && line_type&args->exclude ) return 0; // exclude given variant types
+ if ( args->include && !((line_type<<1) & args->include) ) return 0; // include only given variant types
+ if ( args->exclude && (line_type<<1) & args->exclude ) return 0; // exclude given variant types
}
if ( args->filter )
}
}
- if (args->min_ac)
+ if (args->min_ac!=-1)
{
if (args->min_ac_type == ALLELE_NONREF && args->min_ac>non_ref_ac) return 0; // min AC
else if (args->min_ac_type == ALLELE_MINOR && args->min_ac>minor_ac) return 0; // min minor AC
else if (args->min_ac_type == ALLELE_MAJOR && args->min_ac > major_ac) return 0; // min major AC
else if (args->min_ac_type == ALLELE_NONMAJOR && args->min_ac > an-major_ac) return 0; // min non-major AC
}
- if (args->max_ac)
+ if (args->max_ac!=-1)
{
if (args->max_ac_type == ALLELE_NONREF && args->max_ac<non_ref_ac) return 0; // max AC
else if (args->max_ac_type == ALLELE_MINOR && args->max_ac<minor_ac) return 0; // max minor AC
else if (args->max_ac_type == ALLELE_MAJOR && args->max_ac < major_ac) return 0; // max major AC
else if (args->max_ac_type == ALLELE_NONMAJOR && args->max_ac < an-major_ac) return 0; // max non-major AC
}
- if (args->min_af)
+ if (args->min_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->min_af_type == ALLELE_NONREF && args->min_af>non_ref_ac/(double)an) return 0; // min AF
else if (args->min_af_type == ALLELE_MAJOR && args->min_af > major_ac/(double)an) return 0; // min major AF
else if (args->min_af_type == ALLELE_NONMAJOR && args->min_af > (an-major_ac)/(double)an) return 0; // min non-major AF
}
- if (args->max_af)
+ if (args->max_af!=-1)
{
if (an == 0) return 0; // freq not defined, skip site
if (args->max_af_type == ALLELE_NONREF && args->max_af<non_ref_ac/(double)an) return 0; // max AF
if (args->trim_alts)
{
int ret = bcf_trim_alleles(args->hsub ? args->hsub : args->hdr, line);
- if ( ret==-1 ) error("Error: some GT index is out of bounds at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
+ if ( ret<0 ) error("Error: Could not trim alleles at %s:%d\n", bcf_seqname(args->hsub ? args->hsub : args->hdr, line), line->pos+1);
}
if (args->phased) {
int phased = bcf_all_phased(args->hdr, line);
fprintf(pysam_stderr, " -R, --regions-file <file> restrict to regions listed in a file\n");
fprintf(pysam_stderr, " -t, --targets [^]<region> similar to -r but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
fprintf(pysam_stderr, " -T, --targets-file [^]<file> similar to -R but streams rather than index-jumps. Exclude regions with \"^\" prefix\n");
- fprintf(pysam_stderr, " --threads <int> number of extra output compression threads [0]\n");
+ fprintf(pysam_stderr, " --threads <int> number of extra (de)compression threads [0]\n");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "Subset options:\n");
fprintf(pysam_stderr, " -a, --trim-alt-alleles trim alternate alleles not seen in the subset\n");
fprintf(pysam_stderr, " -q/Q, --min-af/--max-af <float>[:<type>] minimum/maximum frequency for non-reference (nref), 1st alternate (alt1), least frequent\n");
fprintf(pysam_stderr, " (minor), most frequent (major) or sum of all but most frequent (nonmajor) alleles [nref]\n");
fprintf(pysam_stderr, " -u/U, --uncalled/--exclude-uncalled select/exclude sites without a called genotype\n");
- fprintf(pysam_stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,other [null]\n");
+ fprintf(pysam_stderr, " -v/V, --types/--exclude-types <list> select/exclude comma-separated list of variant types: snps,indels,mnps,ref,bnd,other [null]\n");
fprintf(pysam_stderr, " -x/X, --private/--exclude-private select/exclude sites where the non-reference alleles are exclusive (private) to the subset samples\n");
fprintf(pysam_stderr, "\n");
exit(1);
args->output_type = FT_VCF;
args->n_threads = 0;
args->record_cmd_line = 1;
+ args->min_ac = args->max_ac = args->min_af = args->max_af = -1;
int targets_is_file = 0, regions_is_file = 0;
static struct option loptions[] =
error("Failed to read the targets: %s\n", args->targets_list);
}
+ if ( bcf_sr_set_threads(args->files, args->n_threads)<0 ) error("Failed to create threads\n");
if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
init_data(args);
bcf_hdr_write(args->out, out_hdr);
else if ( args->output_type & FT_BCF )
error("BCF output requires header, cannot proceed with -H\n");
+
+ int ret = 0;
if (!args->header_only)
{
while ( bcf_sr_next_line(args->files) )
if ( subset_vcf(args, line) )
bcf_write1(args->out, out_hdr, line);
}
+ ret = args->files->errnum;
+ if ( ret ) fprintf(pysam_stderr,"Error: %s\n", bcf_sr_strerror(args->files->errnum));
}
hts_close(args->out);
destroy_data(args);
bcf_sr_destroy(args->files);
free(args);
- return 0;
+ return ret;
}
-#define BCFTOOLS_VERSION "1.3.1"
+#define BCFTOOLS_VERSION "1.4.1"
exec docker run --rm -v $(pwd):/io quay.io/pypa/manylinux1_x86_64 /io/$0
fi
-yum install -y zlib-devel
+yum install -y zlib-devel bzip2-devel xz-devel
# Python 2.6 is not supported
rm -r /opt/python/cp26*
Commands available in :term:`csamtools` are available as simple
function calls. For example::
- pysam.sort("ex1.bam", "output")
+ pysam.sort("-o", "output.bam", "ex1.bam")
corresponds to the command line::
- samtools sort ex1.bam output
+ samtools sort -o output.bam ex1.bam
Analogous to :class:`~pysam.AlignmentFile`, a
:class:`~pysam.TabixFile` allows fast random access to compressed and
Release notes
=============
+Release 0.11.2.2
+================
+
+Bugfix release to address two issues:
+
+* Changes in 0.11.2.1 broke the GTF/GFF3 parser. Corrected and
+ more tests have been added.
+* [#479] Correct VariantRecord edge cases described in issue
+
+
+Release 0.11.2.1
+================
+
+Release to fix release tar-ball containing 0.11.1 pre-compiled
+C-files.
+
+
+Release 0.11.2
+==============
+
+This release wraps htslib/samtools/bcfools versions 1.4.1 in response
+to a security fix in these libraries. Additionaly the following
+issues have been fixed:
+
+* [#452] add GFF3 support for tabix parsers
+* [#461] Multiple fixes related to VariantRecordInfo and handling of INFO/END
+* [#447] limit query name to 251 characters (only partially addresses issue)
+
+VariantFile and related object fixes
+
+* Restore VariantFile.\_\_dealloc\_\_
+* Correct handling of bcf_str_missing in bcf_array_to_object and
+ bcf_object_to_array
+* Added update() and pop() methods to some dict-like proxy objects
+* scalar INFO entries could not be set again after being deleted
+* VariantRecordInfo.__delitem__ now allows unset flags to be deleted without
+ raising a KeyError
+* Multiple other fixes for VariantRecordInfo methods
+* INFO/END is now accessible only via VariantRecord.stop and
+ VariantRecord.rlen. Even if present behind the scenes, it is no longer
+ accessible via VariantRecordInfo.
+* Add argument to issue a warning instead of an exception if input appears
+ to be truncated
+
+Other features and fixes:
+
+* Make AlignmentFile \_\_dealloc\_\_ and close more
+ stringent
+* Add argument AlignmentFile to issue a warning instead of an
+ exception if input appears to be truncated
+
+Release 0.11.1
+==============
+
+Bugfix release
+
+* [#440] add deprecated 'always' option to infer_query_length for backwards compatibility.
+
+Release 0.11.0
+==============
+
+This release wraps the latest versions of htslib/samtools/bcftools and
+implements a few bugfixes.
+
+* [#413] Wrap HTSlib/Samtools/BCFtools 1.4
+* [#422] Fix missing pysam.sort.usage() message
+* [#411] Fix BGZfile initialization bug
+* [#412] Add seek support for BGZFile
+* [#395] Make BGZfile iterable
+* [#433] Correct getQueryEnd
+* [#419] Export SAM enums such as pysam.CMATCH
+* [#415] Fix access by tid in AlignmentFile.fetch()
+* [#405] Writing SAM now outputs a header by default.
+* [#332] split infer_query_length(always) into infer_query_length and infer_read_length
+
Release 0.10.0
==============
Using samtools commands within python
=====================================
-Commands available in :term:`csamtools` are available
-as simple function calls. For example::
+Commands available in :term:`csamtools` are available as simple
+function calls. Command line options are provided as arguments. For
+example::
- pysam.sort("ex1.bam", "output")
+ pysam.sort("-o", "output.bam", "ex1.bam")
corresponds to the command line::
- samtools sort ex1.bam output
+ samtools sort -o output.bam ex1.bam
-Command line options can be provided as arguments::
-
- pysam.sort("-n", "ex1.bam", "output")
-
-or::
+Or for example::
- pysam.sort("-m", "1000000", "ex1.bam", "output")
+ pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam")
In order to get usage information, try::
- print pysam.sort.usage()
+ print(pysam.sort.usage())
Argument errors raise a :class:`pysam.SamtoolsError`::
EXCLUDE = {
"samtools": (
- "razip.c", "bgzip.c", "main.c",
- "calDepth.c", "bam2bed.c", "wgsim.c",
- "md5fa.c", "md5sum-lite.c", "maq2sam.c",
- "bamcheck.c", "chk_indel.c", "vcf-miniview.c",
+ "razip.c",
+ "bgzip.c",
+ "main.c",
+ "calDepth.c",
+ "bam2bed.c",
+ "wgsim.c",
+ "bam_tview.c",
+ "bam_tview.h",
+ "bam_tview_html.c",
+ "bam_tview_curses.c",
+ "md5fa.c",
+ "md5sum-lite.c",
+ "maq2sam.c",
+ "bamcheck.c",
+ "chk_indel.c",
+ "vcf-miniview.c",
"htslib-1.3", # do not import twice
"hfile_irods.c", # requires irods library
),
if not filename:
continue
dest = filename + ".pysam.c"
- with open(filename) as infile:
+ with open(filename, encoding="utf-8") as infile:
lines = "".join(infile.readlines())
- with open(dest, "w") as outfile:
+
+ with open(dest, "w", encoding="utf-8") as outfile:
outfile.write('#include "pysam.h"\n\n')
subname, _ = os.path.splitext(os.path.basename(filename))
if subname in MAIN.get(basename, []):
old_file = os.path.join(targetdir, f)
if os.path.exists(old_file):
md5_old = hashlib.md5(
- "".join(open(old_file, "r").readlines())).digest()
+ "".join(open(old_file, "r", encoding="utf-8").readlines()).encode()).digest()
md5_new = hashlib.md5(
- "".join(open(src, "r").readlines())).digest()
+ "".join(open(src, "r", encoding="utf-8").readlines()).encode()).digest()
if md5_old != md5_new:
raise ValueError(
"incompatible files for %s and %s" %
import sysconfig
from pysam.libchtslib import *
+from pysam.libcsamtools import *
+from pysam.libcbcftools import *
from pysam.libcutils import *
import pysam.libcutils as libcutils
import pysam.libcfaidx as libcfaidx
--- /dev/null
+#ifndef CBCFTOOLS_UTIL_H
+#define CBCFTOOLS_UTIL_H
+
+int bcftools_main(int argc, char *argv[]);
+
+#endif
--- /dev/null
+#ifndef CSAMTOOLS_UTIL_H
+#define CSAMTOOLS_UTIL_H
+
+int samtools_main(int argc, char *argv[]);
+
+#endif
static inline char pysam_bam_seqi(uint8_t * s, int i) {
return bam_seqi(s,i);}
-// Wrapping bit field access in bam1_core_t
-// bit fields not supported in cython and due
-// to endian-ness it is not clear which part
-// of the bit-field is in the higher or lower bytes.
-static inline uint16_t pysam_get_bin(bam1_t * b) {
- return b->core.bin;}
-
static inline uint8_t pysam_get_qual(bam1_t * b) {
return b->core.qual;}
-static inline uint8_t pysam_get_l_qname(bam1_t * b) {
- return b->core.l_qname;}
-
-static inline uint16_t pysam_get_flag(bam1_t * b) {
- return b->core.flag;}
static inline uint16_t pysam_get_n_cigar(bam1_t * b) {
return b->core.n_cigar;}
-static inline void pysam_set_bin(bam1_t * b, uint16_t v) {
- b->core.bin=v;}
-
static inline void pysam_set_qual(bam1_t * b, uint8_t v) {
b->core.qual=v;}
-static inline void pysam_set_l_qname(bam1_t * b, uint8_t v) {
- b->core.l_qname=v;}
-
-static inline void pysam_set_flag(bam1_t * b, uint16_t v) {
- b->core.flag=v;}
static inline void pysam_set_n_cigar(bam1_t * b, uint16_t v) {
b->core.n_cigar=v;}
int pysam_bam_get_l_aux(bam1_t * b)
char pysam_bam_seqi(uint8_t * s, int i)
- uint16_t pysam_get_bin(bam1_t * b)
uint8_t pysam_get_qual(bam1_t * b)
- uint8_t pysam_get_l_qname(bam1_t * b)
- uint16_t pysam_get_flag(bam1_t * b)
uint16_t pysam_get_n_cigar(bam1_t * b)
- void pysam_set_bin(bam1_t * b, uint16_t v)
void pysam_set_qual(bam1_t * b, uint8_t v)
- void pysam_set_l_qname(bam1_t * b, uint8_t v)
- void pysam_set_flag(bam1_t * b, uint16_t v)
void pysam_set_n_cigar(bam1_t * b, uint16_t v)
void pysam_update_flag(bam1_t * b, uint16_t v, uint16_t flag)
cimport cython
from cpython cimport array as c_array
from cpython.version cimport PY_MAJOR_VERSION
-from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
+from cpython cimport PyBytes_FromStringAndSize
from libc.string cimport strchr
from cpython cimport array as c_array
len(value)] + list(value))
elif isinstance(value, array.array):
+ valuetype = value.typecode
+ if valuetype not in datatype2format:
+ valuetype = None
# binary tags from arrays
if valuetype is None:
array_typecode = map_typecode_python_to_htslib(ord(value.typecode))
return "".join(fmts), args
-cdef inline int32_t calculateQueryLength(bam1_t * src):
+cdef inline int32_t calculateQueryLengthWithoutHardClipping(bam1_t * src):
"""return query length computed from CIGAR alignment.
+ Length ignores hard-clipped bases.
+
+ Return 0 if there is no CIGAR alignment.
+ """
+
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
+
+ if cigar_p == NULL:
+ return 0
+
+ cdef uint32_t k, qpos
+ cdef int op
+ qpos = 0
+
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+
+ if op == BAM_CMATCH or \
+ op == BAM_CINS or \
+ op == BAM_CSOFT_CLIP or \
+ op == BAM_CEQUAL or \
+ op == BAM_CDIFF:
+ qpos += cigar_p[k] >> BAM_CIGAR_SHIFT
+
+ return qpos
+
+
+cdef inline int32_t calculateQueryLengthWithHardClipping(bam1_t * src):
+ """return query length computed from CIGAR alignment.
+
+ Length includes hard-clipped bases.
+
Return 0 if there is no CIGAR alignment.
"""
cdef inline int32_t getQueryStart(bam1_t *src) except -1:
cdef uint32_t * cigar_p
- cdef uint32_t k, op
cdef uint32_t start_offset = 0
+ cdef uint32_t k, op
- if pysam_get_n_cigar(src):
- cigar_p = pysam_bam_get_cigar(src);
- for k from 0 <= k < pysam_get_n_cigar(src):
- op = cigar_p[k] & BAM_CIGAR_MASK
- if op == BAM_CHARD_CLIP:
- if start_offset != 0 and start_offset != src.core.l_qseq:
- PyErr_SetString(ValueError, 'Invalid clipping in CIGAR string')
- return -1
- elif op == BAM_CSOFT_CLIP:
- start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
- else:
- break
+ cigar_p = pysam_bam_get_cigar(src);
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CHARD_CLIP:
+ if start_offset != 0 and start_offset != src.core.l_qseq:
+ raise ValueError('Invalid clipping in CIGAR string')
+ elif op == BAM_CSOFT_CLIP:
+ start_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ break
return start_offset
cdef inline int32_t getQueryEnd(bam1_t *src) except -1:
- cdef uint32_t * cigar_p
- cdef uint32_t k, op
+ cdef uint32_t * cigar_p = pysam_bam_get_cigar(src)
cdef uint32_t end_offset = src.core.l_qseq
+ cdef uint32_t k, op
# if there is no sequence, compute length from cigar string
if end_offset == 0:
- end_offset = calculateQueryLength(src)
-
- # walk backwards in cigar string
- if pysam_get_n_cigar(src) > 1:
- cigar_p = pysam_bam_get_cigar(src);
+ for k from 0 <= k < pysam_get_n_cigar(src):
+ op = cigar_p[k] & BAM_CIGAR_MASK
+ if op == BAM_CMATCH or \
+ op == BAM_CINS or \
+ op == BAM_CEQUAL or \
+ op == BAM_CDIFF or \
+ (op == BAM_CSOFT_CLIP and end_offset == 0):
+ end_offset += cigar_p[k] >> BAM_CIGAR_SHIFT
+ else:
+ # walk backwards in cigar string
for k from pysam_get_n_cigar(src) > k >= 1:
op = cigar_p[k] & BAM_CIGAR_MASK
if op == BAM_CHARD_CLIP:
- if end_offset != 0 and end_offset != src.core.l_qseq:
- PyErr_SetString(ValueError,
- 'Invalid clipping in CIGAR string')
- return -1
+ if end_offset != src.core.l_qseq:
+ raise ValueError('Invalid clipping in CIGAR string')
elif op == BAM_CSOFT_CLIP:
end_offset -= cigar_p[k] >> BAM_CIGAR_SHIFT
else:
if t == o:
return 0
+ cdef uint8_t *a = <uint8_t*>&t.core
+ cdef uint8_t *b = <uint8_t*>&o.core
+
retval = memcmp(&t.core, &o.core, sizeof(bam1_core_t))
-
if retval:
return retval
+
# cmp(t.l_data, o.l_data)
retval = (t.l_data > o.l_data) - (t.l_data < o.l_data)
if retval:
property query_name:
"""the query template name (None if not present)"""
def __get__(self):
- cdef bam1_t * src
- src = self._delegate
- if pysam_get_l_qname(src) == 0:
+
+ cdef bam1_t * src = self._delegate
+ if src.core.l_qname == 0:
return None
+
return charptr_to_str(<char *>pysam_bam_get_qname(src))
def __set__(self, qname):
+
if qname is None or len(qname) == 0:
return
- if len(qname) >= 255:
- raise ValueError("query length out of range {} > 254".format(
+ # See issue #447
+ # (The threshold is 252 chars, but this includes a \0 byte.
+ if len(qname) > 251:
+ raise ValueError("query length out of range {} > 251".format(
len(qname)))
qname = force_bytes(qname)
- cdef bam1_t * src
- cdef int l
- cdef char * p
+ cdef bam1_t * src = self._delegate
+ # the qname is \0 terminated
+ cdef uint8_t l = len(qname) + 1
- src = self._delegate
- p = pysam_bam_get_qname(src)
+ cdef char * p = pysam_bam_get_qname(src)
+ cdef uint8_t l_extranul = 0
+
+ if l % 4 != 0:
+ l_extranul = 4 - l % 4
- # the qname is \0 terminated
- l = len(qname) + 1
pysam_bam_update(src,
- pysam_get_l_qname(src),
- l,
+ src.core.l_qname,
+ l + l_extranul,
<uint8_t*>p)
- pysam_set_l_qname(src, l)
-
+ src.core.l_extranul = l_extranul
+ src.core.l_qname = l + l_extranul
+
# re-acquire pointer to location in memory
# as it might have moved
p = pysam_bam_get_qname(src)
strncpy(p, qname, l)
+ # x might be > 255
+ cdef uint16_t x = 0
+
+ for x from l <= x < l + l_extranul:
+ p[x] = '\0'
property flag:
"""properties flag"""
def __get__(self):
- return pysam_get_flag(self._delegate)
+ return self._delegate.core.flag
def __set__(self, flag):
- pysam_set_flag(self._delegate, flag)
+ self._delegate.core.flag = flag
property reference_name:
""":term:`reference` name (None if no AlignmentFile is associated)"""
src = self._delegate
src.core.pos = pos
if pysam_get_n_cigar(src):
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
+ src.core.bin = hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5)
else:
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- src.core.pos + 1,
- 14,
- 5))
+ src.core.bin = hts_reg2bin(
+ src.core.pos,
+ src.core.pos + 1,
+ 14,
+ 5)
property mapping_quality:
"""mapping quality"""
property bin:
"""properties bin"""
def __get__(self):
- return pysam_get_bin(self._delegate)
+ return self._delegate.core.bin
def __set__(self, bin):
- pysam_set_bin(self._delegate, bin)
+ self._delegate.core.bin = bin
##########################################################
This the index of the first base in :attr:`seq` that is not
soft-clipped.
-
"""
def __get__(self):
return getQueryStart(self._delegate)
property query_alignment_end:
"""end index of the aligned query portion of the sequence (0-based,
- exclusive)"""
+ exclusive)
+
+ This the index just past the last base in :attr:`seq` that is not
+ soft-clipped.
+ """
def __get__(self):
return getQueryEnd(self._delegate)
return result
- def infer_query_length(self, always=True):
- """inferred read length from CIGAR string.
+ def infer_query_length(self, always=False):
+ """infer query length from sequence or CIGAR alignment.
- If *always* is set to True, the read length
- will be always inferred. If set to False, the length
- of the read sequence will be returned if it is
- available.
+ This method deduces the query length from the CIGAR alignment
+ but does not include hard-clipped bases.
- Returns None if CIGAR string is not present.
- """
+ Returns None if CIGAR alignment is not present.
- cdef uint32_t * cigar_p
- cdef bam1_t * src
+ If *always* is set to True, `infer_read_length` is used instead.
+ This is deprecated and only present for backward compatibility.
+ """
+ if always is True:
+ return self.infer_read_length()
+ return calculateQueryLengthWithoutHardClipping(self._delegate)
- src = self._delegate
+ def infer_read_length(self):
+ """infer read length from CIGAR alignment.
- if not always and src.core.l_qseq:
- return src.core.l_qseq
+ This method deduces the read length from the CIGAR alignment
+ including hard-clipped bases.
- return calculateQueryLength(src)
+ Returns None if CIGAR alignment is not present.
+ """
+ return calculateQueryLengthWithHardClipping(self._delegate)
def get_reference_sequence(self):
"""return the reference sequence.
+-----+--------------+-----+
|X |BAM_CDIFF |8 |
+-----+--------------+-----+
- |NM |NM tag |9 |
+ |B |BAM_CBACK |9 |
+ +-----+--------------+-----+
+ |NM |NM tag |10 |
+-----+--------------+-----+
If no cigar string is present, empty arrays will be returned.
+-----+--------------+-----+
|X |BAM_CDIFF |8 |
+-----+--------------+-----+
+ |B |BAM_CBACK |9 |
+ +-----+--------------+-----+
.. note::
The output is a list of (operation, length) tuples, such as
k += 1
## setting the cigar string requires updating the bin
- pysam_set_bin(src,
- hts_reg2bin(
- src.core.pos,
- bam_endpos(src),
- 14,
- 5))
+ src.core.bin = hts_reg2bin(
+ src.core.pos,
+ bam_endpos(src),
+ 14,
+ 5)
cpdef set_tag(self,
def __get__(self):
return self._is_refskip
+
+cpdef enum CIGAR_OPS:
+ CMATCH = 0
+ CINS = 1
+ CDEL = 2
+ CREF_SKIP = 3
+ CSOFT_CLIP = 4
+ CHARD_CLIP = 5
+ CPAD = 6
+ CEQUAL = 7
+ CDIFF = 8
+ CBACK = 9
+
+
+cpdef enum SAM_FLAGS:
+ # the read is paired in sequencing, no matter whether it is mapped in a pair
+ FPAIRED = 1
+ # the read is mapped in a proper pair
+ FPROPER_PAIR = 2
+ # the read itself is unmapped; conflictive with FPROPER_PAIR
+ FUNMAP = 4
+ # the mate is unmapped
+ FMUNMAP = 8
+ # the read is mapped to the reverse strand
+ FREVERSE = 16
+ # the mate is mapped to the reverse strand
+ FMREVERSE = 32
+ # this is read1
+ FREAD1 = 64
+ # this is read2
+ FREAD2 = 128
+ # not primary alignment
+ FSECONDARY = 256
+ # QC failure
+ FQCFAIL = 512
+ # optical or PCR duplicate
+ FDUP = 1024
+ # supplementary alignment
+ FSUPPLEMENTARY = 2048
+
+
__all__ = [
"AlignedSegment",
"PileupColumn",
- "PileupRead"]
+ "PileupRead",
+ "CMATCH",
+ "CINS",
+ "CDEL",
+ "CREF_SKIP",
+ "CSOFT_CLIP",
+ "CHARD_CLIP",
+ "CPAD",
+ "CEQUAL",
+ "CDIFF",
+ "CBACK",
+ "FPAIRED",
+ "FPROPER_PAIR",
+ "FUNMAP",
+ "FMUNMAP",
+ "FREVERSE",
+ "FMREVERSE",
+ "FREAD1",
+ "FREAD2",
+ "FSECONDARY",
+ "FQCFAIL",
+ "FDUP",
+ "FSUPPLEMENTARY"]
+
# The principal classes defined in this module are:
#
# class AlignmentFile read/write access to SAM/BAM/CRAM formatted files
-#
+#
# class IndexedReads index a SAM/BAM/CRAM file by query name while keeping
# the original sort order intact
-#
+#
# Additionally this module defines numerous additional classes that
# are part of the internal API. These are:
-#
+#
# Various iterator classes to iterate over alignments in sequential
# (IteratorRow) or in a stacked fashion (IteratorColumn):
-#
+#
# class IteratorRow
# class IteratorRowRegion
# class IteratorRowHead
cimport cython
########################################################
-## Constants and global variables
-
-# defines imported from samtools
-DEF SEEK_SET = 0
-DEF SEEK_CUR = 1
-DEF SEEK_END = 2
-
+## global variables
# maximum genomic coordinace
-cdef int MAX_POS = 2 << 29
+cdef int MAX_POS = 2 << 29
# valid types for SAM headers
VALID_HEADER_TYPES = {"HD" : dict,
# default type conversions within SAM header records
KNOWN_HEADER_FIELDS = {"HD" : {"VN" : str, "SO" : str, "GO" : str},
- "SQ" : {"SN" : str, "LN" : int, "AS" : str,
+ "SQ" : {"SN" : str, "LN" : int, "AS" : str,
"M5" : str, "SP" : str, "UR" : str,
"AH" : str,},
"RG" : {"ID" : str, "CN" : str, "DS" : str,
"LB" : str, "PG" : str, "PI" : str,
"PL" : str, "PM" : str, "PU" : str,
"SM" : str,},
- "PG" : {"ID" : str, "PN" : str, "CL" : str,
+ "PG" : {"ID" : str, "PN" : str, "CL" : str,
"PP" : str, "DS" : str, "VN" : str,},}
# output order of fields within records. Ensure that CL is at
return "\t".join(line)
-cdef bam_hdr_t * build_header(new_header):
+cdef bam_hdr_t * build_header_from_dict(new_header):
'''return a new header built from a dictionary in `new_header`.
This method inserts the text field, target_name and target_len.
'''
-
- lines = []
-
- # check if hash exists
+ cdef list lines = []
# create new header and copy old data
- cdef bam_hdr_t * dest
-
- dest = bam_hdr_init()
+ cdef bam_hdr_t * dest = bam_hdr_init()
# first: defined tags
for record in VALID_HEADERS:
return dest
+cdef bam_hdr_t * build_header_from_list(reference_names,
+ reference_lengths,
+ add_sq_text=True,
+ text=None):
+
+ assert len(reference_names) == len(reference_lengths), \
+ "unequal names and lengths of reference sequences"
+
+ cdef bam_hdr_t * dest = bam_hdr_init()
+
+ # allocate and fill header
+ reference_names = [force_bytes(ref) for ref in reference_names]
+ dest.n_targets = len(reference_names)
+ n = 0
+ for x in reference_names:
+ n += len(x) + 1
+ dest.target_name = <char**>calloc(n, sizeof(char*))
+ dest.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
+ for x from 0 <= x < dest.n_targets:
+ dest.target_len[x] = reference_lengths[x]
+ name = reference_names[x]
+ dest.target_name[x] = <char*>calloc(
+ len(name) + 1, sizeof(char))
+ strncpy(dest.target_name[x], name, len(name))
+
+ # Optionally, if there is no text, add a SAM
+ # compatible header to output file.
+ if text is None and add_sq_text:
+ text = []
+ for x from 0 <= x < dest.n_targets:
+ text.append("@SQ\tSN:%s\tLN:%s\n" % \
+ (force_str(reference_names[x]),
+ reference_lengths[x]))
+ text = ''.join(text)
+
+ cdef char * ctext = NULL
+
+ if text is not None:
+ # copy without \0
+ text = force_bytes(text)
+ ctext = text
+ dest.l_text = strlen(ctext)
+ dest.text = <char*>calloc(
+ strlen(ctext), sizeof(char))
+ memcpy(dest.text, ctext, strlen(ctext))
+
+ return dest
+
+
cdef class AlignmentFile(HTSFile):
"""AlignmentFile(filepath_or_object, mode=None, template=None,
reference_names=None, reference_lengths=None, text=NULL,
header=None, add_sq_text=False, check_header=True, check_sq=True,
- reference_filename=None, filename=None, duplicate_filehandle=True)
+ reference_filename=None, filename=None, duplicate_filehandle=True,
+ ignore_truncation=False)
- A :term:`SAM`/:term:`BAM` formatted file.
+ A :term:`SAM`/:term:`BAM`/:term:`CRAM` formatted file.
If `filepath_or_object` is a string, the file is automatically
opened. If `filepath_or_object` is a python File object, the
:class:`~pysam.AlignmentFile`).
2. If `header` is given, the header is built from a
- multi-level dictionary.
+ multi-level dictionary.
3. If `text` is given, new header text is copied from raw
text.
when writing, use the string provided as the header
reference_names : list
- see referece_lengths
+ see reference_lengths
reference_lengths : list
- when writing, build header from list of chromosome names and
- lengths. By default, 'SQ' and 'LN' tags will be added to the
- header text. This option can be changed by unsetting the flag
- `add_sq_text`.
+ when writing or opening a SAM file without header build header
+ from list of chromosome names and lengths. By default, 'SQ'
+ and 'LN' tags will be added to the header text. This option
+ can be changed by unsetting the flag `add_sq_text`.
add_sq_text : bool
do not add 'SQ' and 'LN' tags to header. This option permits
construction :term:`SAM` formatted files without a header.
+ add_sam_header : bool
+ when outputting SAM the default is to output a header. This is
+ equivalent to opening the file in 'wh' mode. If this option is
+ set to False, no header will be output. To read such a file,
+ set `check_header=False`.
+
check_header : bool
- when reading, check if header is present (default=True)
+ obsolete: when reading a SAM file, check if header is present
+ (default=True)
check_sq : bool
when reading, check if SQ entries are present in header
Alternative to filepath_or_object. Filename of the file
to be opened.
- duplicate_filehandle: bool
+ duplicate_filehandle: bool
By default, file handles passed either directly or through
File-like objects will be duplicated before passing them to
htslib. The duplication prevents issues where the same stream
high-level python object. Set to False to turn off
duplication.
+ ignore_truncation: bool
+ Issue a warning, instead of raising an error if the current file
+ appears to be truncated due to a missing EOF marker. Only applies
+ to bgzipped formats. (Default=False)
"""
def __cinit__(self, *args, **kwargs):
header=None,
port=None,
add_sq_text=True,
+ add_sam_header=True,
check_header=True,
check_sq=True,
filepath_index=None,
referencenames=None,
referencelengths=None,
- duplicate_filehandle=True):
+ duplicate_filehandle=True,
+ ignore_truncation=False):
'''open a sam, bam or cram formatted file.
If _open is called on an existing file, the current file
will be closed and a new file will be opened.
+
'''
cdef char *cfilename = NULL
cdef char *creference_filename = NULL
if mode is None:
mode = "r"
+ if add_sam_header and mode == "w":
+ mode = "wh"
+
assert mode in ("r", "w", "rb", "wb", "wh",
"wbu", "rU", "wb0",
"rc", "wc"), \
self.reference_filename = reference_filename = encode_filename(
reference_filename)
- cdef char * ctext
- cdef hFILE * fp
- ctext = NULL
-
if mode[0] == 'w':
# open file for writing
if template:
self.header = bam_hdr_dup(template.header)
elif header:
- self.header = build_header(header)
+ self.header = build_header_from_dict(header)
else:
- # build header from a target names and lengths
assert reference_names and reference_lengths, \
("either supply options `template`, `header` "
"or both `reference_names` and `reference_lengths` "
"for writing")
- assert len(reference_names) == len(reference_lengths), \
- "unequal names and lengths of reference sequences"
-
- # allocate and fill header
- reference_names = [force_bytes(ref) for ref in reference_names]
- self.header = bam_hdr_init()
- self.header.n_targets = len(reference_names)
- n = 0
- for x in reference_names:
- n += len(x) + 1
- self.header.target_name = <char**>calloc(n, sizeof(char*))
- self.header.target_len = <uint32_t*>calloc(n, sizeof(uint32_t))
- for x from 0 <= x < self.header.n_targets:
- self.header.target_len[x] = reference_lengths[x]
- name = reference_names[x]
- self.header.target_name[x] = <char*>calloc(
- len(name) + 1, sizeof(char))
- strncpy(self.header.target_name[x], name, len(name))
-
- # Optionally, if there is no text, add a SAM
- # compatible header to output file.
- if text is None and add_sq_text:
- text = []
- for x from 0 <= x < self.header.n_targets:
- text.append("@SQ\tSN:%s\tLN:%s\n" % \
- (force_str(reference_names[x]),
- reference_lengths[x]))
- text = ''.join(text)
-
- if text is not None:
- # copy without \0
- text = force_bytes(text)
- ctext = text
- self.header.l_text = strlen(ctext)
- self.header.text = <char*>calloc(
- strlen(ctext), sizeof(char))
- memcpy(self.header.text, ctext, strlen(ctext))
+ # build header from a target names and lengths
+ self.header = build_header_from_list(
+ reference_names,
+ reference_lengths,
+ add_sq_text=add_sq_text,
+ text=text)
self.htsfile = self._open_htsfile()
# open file for reading
if not self._exists():
raise IOError("file `%s` not found" % self.filename)
-
+
self.htsfile = self._open_htsfile()
if self.htsfile == NULL:
if self.htsfile.format.category != sequence_data:
raise ValueError("file does not contain alignment data")
+ self.check_truncation(ignore_truncation)
+
# bam files require a valid header
if self.is_bam or self.is_cram:
with nogil:
"file does not have valid header (mode='%s') "
"- is it BAM format?" % mode )
else:
- # in sam files it is optional (htsfile full of
- # unmapped reads)
- if check_header:
+ # in sam files a header is optional, but requires
+ # reference names and lengths
+ if reference_names and reference_lengths:
+ self.header = build_header_from_list(
+ reference_names,
+ reference_lengths,
+ add_sq_text=add_sq_text,
+ text=text)
+ else:
with nogil:
self.header = sam_hdr_read(self.htsfile)
if self.header == NULL:
raise ValueError(
- "file does not have valid header (mode='%s') "
- "- is it SAM format?" % mode )
- # self.header.ignore_sam_err = True
+ "file does not have valid header (mode='%s'), "
+ "please provide reference_names and reference_lengths")
# set filename with reference sequences
if self.is_cram and reference_filename:
if not self.is_open:
raise ValueError("I/O operation on closed file")
if not 0 <= tid < self.header.n_targets:
- raise ValueError("reference_id %i out of range 0<=tid<%i" %
+ raise ValueError("reference_id %i out of range 0<=tid<%i" %
(tid, self.header.n_targets))
return charptr_to_str(self.header.target_name[tid])
Alternatively, a samtools :term:`region` string can be
supplied.
-
+
If any of the coordinates are missing they will be replaced by the
minimum (`start`) or maximum (`end`) coordinate.
Returns
-------
-
+
tuple : a tuple of `flag`, :term:`tid`, `start` and `end`. The
flag indicates whether no coordinates were supplied and the
genomic region is the complete genomic space.
Raises
------
-
+
ValueError
for invalid or out of bounds regions.
cdef long long rstart
cdef long long rend
+ if reference is None and tid is None and region is None:
+ return 0, 0, 0, 0
+
rtid = -1
rstart = 0
rend = MAX_POS
if len(parts) >= 3:
rend = int(parts[2])
- if not reference:
- return 0, 0, 0, 0
-
if tid is not None:
rtid = tid
+ if rtid < 0 or rtid >= self.header.n_targets:
+ raise IndexError("invalid reference, {} out of range 0-{}".format(
+ rtid, self.header.n_targets))
else:
rtid = self.gettid(reference)
tid=None,
until_eof=False,
multiple_iterators=False):
- """fetch reads aligned in a :term:`region`.
+ """fetch reads aligned in a :term:`region`.
See :meth:`AlignmentFile.parse_region` for more information
on genomic regions.
Parameters
----------
-
+
until_eof : bool
If `until_eof` is True, all reads from the current file
file. Using this option will also fetch unmapped reads.
multiple_iterators : bool
-
+
If `multiple_iterators` is True, multiple
iterators on the same file can be used at the same time. The
iterator returned will receive its own copy of a filehandle to
if has_coord:
return IteratorRowRegion(
- self, rtid, rstart, rend,
+ self, rtid, rstart, rend,
multiple_iterators=multiple_iterators)
else:
if until_eof:
else:
if has_coord:
raise ValueError(
- "fetching by region is not available for sam files")
+ "fetching by region is not available for SAM files")
- if self.header == NULL:
+ if multiple_iterators == True:
raise ValueError(
- "fetch called for htsfile without header")
+ "multiple iterators not implemented for SAM files")
- # check if targets are defined
- # give warning, sam_read1 segfaults
- if self.header.n_targets == 0:
- warnings.warn("fetch called for htsfile without header")
-
return IteratorRowAll(self,
multiple_iterators=multiple_iterators)
def head(self, n, multiple_iterators=True):
- '''return an iterator over the first n alignments.
+ '''return an iterator over the first n alignments.
This iterator is is useful for inspecting the bam-file.
----------
multiple_iterators : bool
-
+
is set to True by default in order to
avoid changing the current file position.
-
+
Returns
-------
-
+
an iterator over a collection of reads
-
+
'''
return IteratorRowHead(self, n,
multiple_iterators=multiple_iterators)
not re-opened the file.
.. note::
-
+
This method is too slow for high-throughput processing.
If a read needs to be processed with its mate, work
from a read name sorted file or, better, cache reads.
Returns
-------
-
+
:class:`~pysam.AlignedSegment` : the mate
Raises
Parameters
----------
-
+
reference : string
reference_name of the genomic region (chromosome)
end : int
end of the genomic region
-
+
region : string
a region string in samtools format.
until_eof : bool
- count until the end of the file, possibly including
+ count until the end of the file, possibly including
unmapped reads as well.
read_callback: string or function
return counter
@cython.boundscheck(False) # we do manual bounds checking
- def count_coverage(self,
+ def count_coverage(self,
reference=None,
start=None,
end=None,
Parameters
----------
-
+
reference : string
reference_name of the genomic region (chromosome)
quality_threshold : int
quality_threshold is the minimum quality score (in phred) a
- base has to reach to be counted.
+ base has to reach to be counted.
read_callback: string or function
four array.arrays of the same length in order A C G T : tuple
"""
-
+
cdef int _start = start
cdef int _stop = end
cdef int length = _stop - _start
filter_method = 1
elif read_callback == "nofilter":
filter_method = 2
-
+
cdef int _threshold = quality_threshold
for read in self.fetch(reference=reference,
start=start,
return res
def close(self):
- '''
- closes the :class:`pysam.AlignmentFile`.'''
+ '''closes the :class:`pysam.AlignmentFile`.'''
if self.htsfile == NULL:
return
cdef int ret = hts_close(self.htsfile)
- hts_idx_destroy(self.index)
self.htsfile = NULL
+ if self.index != NULL:
+ hts_idx_destroy(self.index)
+ self.index = NULL
+
+ if self.header != NULL:
+ bam_hdr_destroy(self.header)
+ self.header = NULL
+
if ret < 0:
global errno
if errno == EPIPE:
raise OSError(errno, force_str(strerror(errno)))
def __dealloc__(self):
- # remember: dealloc cannot call other methods
- # note: no doc string
- # note: __del__ is not called.
-
- # FIXME[kbj]: isn't self.close a method? I've been duplicating
- # close within __dealloc__ (see BCFFile.__dealloc__). Not a pretty
- # solution and perhaps unnecessary given that calling self.close has
- # been working for years.
- # AH: I have removed the call to close. Even though it is working,
- # it seems to be dangerous according to the documentation as the
- # object be partially deconstructed already.
cdef int ret = 0
if self.htsfile != NULL:
ret = hts_close(self.htsfile)
- hts_idx_destroy(self.index);
self.htsfile = NULL
- bam_destroy1(self.b)
+ if self.index != NULL:
+ hts_idx_destroy(self.index)
+ self.index = NULL
+
if self.header != NULL:
bam_hdr_destroy(self.header)
+ self.header = NULL
+ if self.b:
+ bam_destroy1(self.b)
+ self.b = NULL
if ret < 0:
global errno
errno = 0
else:
raise OSError(errno, force_str(strerror(errno)))
-
+
cpdef int write(self, AlignedSegment read) except -1:
'''
write a single :class:`pysam.AlignedSegment` to disk.
Returns
-------
-
+
int : the number of bytes written. If the file is closed,
this will be 0.
'''
return self.header.n_targets
property references:
- """tuple with the names of :term:`reference` sequences. This is a
+ """tuple with the names of :term:`reference` sequences. This is a
read-only attribute"""
def __get__(self):
if not self.is_open: raise ValueError( "I/O operation on closed file" )
property text:
'''string with the full contents of the :term:`sam file` header as a
- string.
+ string.
This is a read-only attribute.
-
+
See :attr:`pysam.AlignmentFile.header` to get a parsed
representation of the header.
'''
return from_string_and_size(self.header.text, self.header.l_text)
property header:
- """two-level dictionay with header information from the file.
-
+ """two-level dictionay with header information from the file.
+
This is a read-only attribute.
The first level contains the record (``HD``, ``SQ``, etc) and
the second level contains the fields (``VN``, ``LN``, etc).
-
+
The parser is validating and will raise an AssertionError if
if encounters any record or field tags that are not part of
the SAM specification. Use the
raise ValueError( "I/O operation on closed file" )
result = {}
-
+
if self.header.text != NULL:
# convert to python string (note: call self.text to
# create 0-terminated string)
x = {}
for idx, field in enumerate(fields[1:]):
- if ":" not in field:
+ if ":" not in field:
raise ValueError("malformatted header: no ':' in field" )
key, value = field.split(":", 1)
if key in ("CL",):
"can not iterate over samfile without header")
return self
- cdef bam1_t * getCurrent( self ):
+ cdef bam1_t * getCurrent(self):
return self.b
cdef int cnext(self):
raise IOError('truncated file')
else:
raise StopIteration
-
+
# Compatibility functions for pysam < 0.8.3
def gettid(self, reference):
"""deprecated, use get_tid() instead"""
return self.get_tid(reference)
-
+
def getrname(self, tid):
"""deprecated, use get_reference_name() instead"""
return self.get_reference_name(tid)
def __init__(self, AlignmentFile samfile, int multiple_iterators=False):
cdef char *cfilename
cdef char *creference_filename
-
+
if not samfile.is_open:
raise ValueError("I/O operation on closed file")
tid,
beg,
end)
-
+
def __iter__(self):
return self
def __iter__(self):
return self
- cdef bam1_t * getCurrent( self ):
+ cdef bam1_t * getCurrent(self):
return self.b
cdef int cnext(self):
def __iter__(self):
return self
- cdef bam1_t * getCurrent( self ):
+ cdef bam1_t * getCurrent(self):
return self.b
cdef int cnext(self):
the samtools pileup.
'''
- # Note that this method requries acces to some
+ # Note that this method requries acces to some
# functions in the samtools code base and is thus
# not htslib only.
# The functions accessed in samtools are:
skip = 0
# realign read - changes base qualities
- if d.seq != NULL and is_cns and not is_nobaq:
- bam_prob_realn(b, d.seq)
+ if d.seq != NULL and is_cns and not is_nobaq:
+ # flag:
+ # apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
+ sam_prob_realn(b, d.seq, d.seq_len, 0)
if d.seq != NULL and capQ_thres > 10:
- q = bam_cap_mapQ(b, d.seq, capQ_thres)
+ q = sam_cap_mapq(b, d.seq, d.seq_len, capQ_thres)
if q < 0:
skip = 1
elif b.core.qual > q:
Valid values are None, "all" (default), "nofilter" or "samtools".
See AlignmentFile.pileup for description.
-
+
fastafile
A :class:`~pysam.FastaFile` object
if self.plp == NULL:
raise StopIteration
-
+
if self.truncate:
if self.start > self.pos: continue
if self.pos >= self.end: raise StopIteration
self.pos,
self.n_plp,
self.samfile)
-
+
# otherwise, proceed to next reference or stop
self.tid += 1
if self.tid < self.samfile.nreferences:
Raises
------
-
+
KeyError
if the `query_name` is not in the index.
cdef class VariantHeader(object):
cdef bcf_hdr_t *ptr
- cpdef VariantRecord new_record(self)
cdef _subset_samples(self, include_samples)
cdef class VariantHeaderRecord(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef bcf_hrec_t *ptr
cdef class VariantHeaderRecords(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef class VariantHeaderContigs(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef class VariantHeaderSamples(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef class VariantContig(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef int id
cdef class VariantMetadata(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef int type
cdef int id
cdef class VariantHeaderMetadata(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef int32_t type
cdef class VariantRecord(object):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef bcf1_t *ptr
cdef class BCFIndex(BaseIndex):
- cdef VariantHeader header
+ cdef readonly VariantHeader header
cdef hts_idx_t *ptr
cdef readonly bint is_reading # true if file has begun reading records
cdef readonly bint header_written # true if header has already been written
- cpdef VariantRecord new_record(self)
-
cpdef int write(self, VariantRecord record) except -1
from pysam.libchtslib cimport HTSFile, hisremote
-from warnings import warn
-
-
__all__ = ['VariantFile',
'VariantHeader',
'VariantHeaderRecord',
from pysam.libcutils cimport encode_filename, from_string_and_size
+########################################################################
+########################################################################
+## Sentinel object
+########################################################################
+
+cdef object _nothing = object()
+
########################################################################
########################################################################
## VCF/BCF string intern system
return val
+########################################################################
+########################################################################
+## Genotype math
+########################################################################
+
+cdef int comb(int n, int k) except -1:
+ """Return binomial coeffient: n choose k
+
+ >>> comb(5, 1)
+ 5
+ >>> comb(5, 2)
+ 10
+ >>> comb(2, 2)
+ 1
+ >>> comb(100, 2)
+ 4950
+ """
+ if k > n:
+ return 0
+ elif k == n:
+ return 1
+ elif k > n // 2:
+ k = n - k
+
+ cdef d, result
+
+ d = result = n - k + 1
+ for i in range(2, k + 1):
+ d += 1
+ result *= d
+ result //= i
+ return result
+
+
+cdef inline int bcf_geno_combinations(int ploidy, int alleles) except -1:
+ """Return the count of genotypes expected for the given ploidy and number of alleles.
+
+ >>> bcf_geno_combinations(1, 2)
+ 2
+ >>> bcf_geno_combinations(2, 2)
+ 3
+ >>> bcf_geno_combinations(2, 3)
+ 6
+ >>> bcf_geno_combinations(3, 2)
+ 4
+ """
+ return comb(alleles + ploidy - 1, ploidy)
+
+
########################################################################
########################################################################
## Low level type conversion helpers
cdef inline int is_gt_fmt(bcf_hdr_t *hdr, int fmt_id):
- return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), "GT") == 0
+ return strcmp(bcf_hdr_int2id(hdr, BCF_DT_ID, fmt_id), 'GT') == 0
+
+
+cdef inline int bcf_genotype_count(bcf_hdr_t *hdr, bcf1_t *rec, int sample) except -1:
+ if sample < 0:
+ raise ValueError('genotype is only valid as a format field')
+
+ cdef int32_t *gt_arr = NULL
+ cdef int ngt = 0
+ ngt = bcf_get_genotypes(hdr, rec, >_arr, &ngt)
+
+ if ngt <= 0 or not gt_arr:
+ return 0
+
+ assert ngt % rec.n_sample == 0
+ cdef int max_ploidy = ngt // rec.n_sample
+ cdef int32_t *gt = gt_arr + sample * max_ploidy
+ cdef int ploidy = 0
+
+ while ploidy < max_ploidy and gt[0] != bcf_int32_vector_end:
+ gt += 1
+ ploidy += 1
+
+ free(<void*>gt_arr)
+
+ return bcf_geno_combinations(ploidy, rec.n_allele)
cdef tuple char_array_to_tuple(const char **a, ssize_t n, int free_after=0):
cdef int32_t *data32
cdef float *dataf
cdef int i
+ cdef bytes b
if not data or n <= 0:
return None
if type == BCF_BT_CHAR:
datac = <char *>data
- while n and datac[n-1] == bcf_str_vector_end:
- n -= 1
- value = charptr_to_str_w_len(datac, n) if datac[0] != bcf_str_missing else None
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
- value = tuple(v or None for v in value.split(',')) if value else ()
- # FIXME: Need to know length? Report errors? Pad with missing values? Not clear what to do.
+ if not n:
+ value = ()
+ else:
+ # Check if at least one null terminator is present
+ if datac[n-1] == bcf_str_vector_end:
+ # If so, create a string up to the first null terminator
+ b = datac
+ else:
+ # Otherwise, copy the entire block
+ b = datac[:n]
+ value = tuple(v.decode('ascii') if v and v != bcf_str_missing else None for v in b.split(b','))
else:
value = []
if type == BCF_BT_INT8:
cdef float *dataf
cdef ssize_t i, value_count = len(values)
- assert(value_count <= n)
+ assert value_count <= n
if bt_type == BCF_BT_CHAR:
if not isinstance(values, (str, bytes)):
- values = b','.join(force_bytes(v) if v is not None else b'' for v in values)
+ values = b','.join(force_bytes(v) if v else bcf_str_missing for v in values)
value_count = len(values)
- assert(value_count <= n)
+ assert value_count <= n
datac = <char *>data
memcpy(datac, <char *>values, value_count)
for i in range(value_count, n):
raise TypeError('unsupported types')
-cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar):
+cdef bcf_get_value_count(VariantRecord record, int hl_type, int id, ssize_t *count, int *scalar, int sample):
if record is None:
raise ValueError('record must not be None')
elif length == BCF_VL_A:
count[0] = r.n_allele - 1
elif length == BCF_VL_G:
- count[0] = r.n_allele * (r.n_allele + 1) // 2
+ count[0] = bcf_genotype_count(hdr, r, sample)
elif length == BCF_VL_VAR:
count[0] = -1
else:
cdef ssize_t count
cdef int scalar
- bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar)
+ bcf_get_value_count(record, BCF_HL_INFO, z.key, &count, &scalar, -1)
if z.len == 0:
if bcf_hdr_id2type(hdr, BCF_HL_INFO, z.key) == BCF_HT_FLAG:
return value
-cdef object bcf_check_values(VariantRecord record, value, int hl_type, int ht_type,
+cdef object bcf_check_values(VariantRecord record, value, int sample,
+ int hl_type, int ht_type,
int id, int bt_type, ssize_t bt_len,
ssize_t *value_count, int *scalar, int *realloc):
if record is None:
raise ValueError('record must not be None')
- bcf_get_value_count(record, hl_type, id, value_count, scalar)
+ bcf_get_value_count(record, hl_type, id, value_count, scalar, sample)
# Validate values now that we know the type and size
values = (value,) if not isinstance(value, (list, tuple)) else value
# KBJ: htslib lies about the cardinality of GT fields-- they're really VLEN (-1)
value_count[0] = -1
- if value_count[0] != -1 and value_count[0] != len(values):
+ cdef int given = len(values)
+ if value_count[0] != -1 and value_count[0] != given:
if scalar[0]:
- raise TypeError('value expected to be scalar'.format(value_count[0]))
+ raise TypeError('value expected to be scalar, given len={}'.format(value_count[0], given))
else:
- raise TypeError('values expected to be {:d}-tuple'.format(value_count[0]))
+ raise TypeError('values expected to be {}-tuple, given len={}'.format(value_count[0], given))
if ht_type == BCF_HT_REAL:
for v in values:
cdef bcf_hdr_t *hdr = record.header.ptr
cdef bcf1_t *r = record.ptr
- cdef vdict_t *d
- cdef khiter_t k
cdef int info_id, info_type, scalar, dst_type, realloc, vlen = 0
cdef ssize_t i, value_count, alloc_len, alloc_size, dst_size
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
if info:
info_id = info.key
else:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
+ info_id = bcf_header_get_info_id(hdr, bkey)
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('unknown INFO')
-
- info_id = kh_val_vdict(d, k).id
+ if info_id < 0:
+ raise KeyError('unknown INFO: {}'.format(key))
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
info_type = bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id)
- values = bcf_check_values(record, value, BCF_HL_INFO, info_type, info_id,
+ values = bcf_check_values(record, value, -1,
+ BCF_HL_INFO, info_type, info_id,
info.type if info else -1,
info.len if info else -1,
&value_count, &scalar, &realloc)
vlen = value_count < 0
value_count = len(values)
+ # DISABLED DUE TO ISSUES WITH THE CRAZY POINTERS
# If we can, write updated values to existing allocated storage
- if info and not realloc:
+ if 0 and info and not realloc:
r.d.shared_dirty |= BCF1_DIRTY_INF
if value_count == 0:
info.len = 0
- # FIXME: Check if need to free vptr if info.len > 0?
+ if not info.vptr:
+ info.vptr = <uint8_t *>&info.v1.i
+
elif value_count == 1:
# FIXME: Check if need to free vptr if info.len > 0?
if info.type == BCF_BT_INT8 or info.type == BCF_BT_INT16 or info.type == BCF_BT_INT32:
bcf_object_to_array(values, &info.v1.f, BCF_BT_FLOAT, 1, vlen)
else:
raise TypeError('unsupported info type code')
+
info.len = 1
+ if not info.vptr:
+ info.vptr = <uint8_t *>&info.v1.i
else:
bcf_object_to_array(values, info.vptr, info.type, info.len, vlen)
+
return
alloc_len = max(1, value_count)
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
if not info:
raise KeyError(key)
- bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar)
+ bcf_get_value_count(record, BCF_HL_INFO, info.key, &value_count, &scalar, -1)
if value_count <= 0:
null_value = ()
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
- raise KeyError('invalid FORMAT')
+ raise KeyError('invalid FORMAT: {}'.format(key))
if is_gt_fmt(hdr, fmt.id):
return bcf_format_get_allele_indices(sample)
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar)
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &count, &scalar, sample.index)
if fmt.p and fmt.n and fmt.size:
return bcf_array_to_object(fmt.p + sample.index * fmt.size, fmt.type, fmt.n, count, scalar)
if sample is None:
raise ValueError('sample must not be None')
+ if key == 'phased':
+ sample.phased = bool(value)
+ return
+
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
cdef int fmt_id
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if fmt:
k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_FMT] & 0xF == 0xF:
- raise KeyError('unknown format')
+ raise KeyError('unknown format: {}'.format(key))
fmt_id = kh_val_vdict(d, k).id
# KBJ: GT field is considered to be a string by the VCF header but BCF represents it as INT.
fmt_type = BCF_HT_INT
- values = bcf_check_values(sample.record, value, BCF_HL_FMT, fmt_type, fmt_id,
+ values = bcf_check_values(sample.record, value, sample.index,
+ BCF_HL_FMT, fmt_type, fmt_id,
fmt.type if fmt else -1,
fmt.n if fmt else -1,
&value_count, &scalar, &realloc)
if fmt and fmt.n > alloc_len:
alloc_len = fmt.n
- n = bcf_hdr_nsamples(hdr)
+ n = r.n_sample
new_values = bcf_empty_array(fmt_type, n*alloc_len, vlen)
cdef char *valp = <char *>new_values
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
raise KeyError(key)
- bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar)
+ bcf_get_value_count(sample.record, BCF_HL_FMT, fmt.id, &value_count, &scalar, sample.index)
if value_count <= 0:
null_value = ()
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t nsamples = bcf_hdr_nsamples(hdr)
+ cdef int32_t nsamples = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
cdef bcf_hdr_t *hdr = sample.record.header.ptr
cdef bcf1_t *r = sample.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if bcf_unpack(r, BCF_UN_ALL) < 0:
raise ValueError('Error unpacking VariantRecord')
data32[i] = (data32[i] & 0xFFFFFFFE) | phased
+cdef inline bcf_sync_end(VariantRecord record):
+ cdef bcf_hdr_t *hdr = record.header.ptr
+ cdef bcf_info_t *info
+ cdef int end_id = bcf_header_get_info_id(record.header.ptr, b'END')
+ cdef int ref_len = len(record.ref)
+
+ # Delete INFO/END if no alleles are present or if rlen is equal to len(ref)
+ if not record.ptr.n_allele or record.ptr.rlen == ref_len:
+ # If INFO/END is not defined in the header, it doesn't exist in the record
+ if end_id >= 0:
+ info = bcf_get_info(hdr, record.ptr, b'END')
+ if info and info.vptr:
+ if bcf_update_info(hdr, record.ptr, b'END', NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete END')
+ else:
+ # Create END header, if not present
+ if end_id < 0:
+ record.header.info.add('END', number=1, type='Integer', description='Stop position of the interval')
+
+ # Update to reflect stop position
+ bcf_info_set_value(record, b'END', record.ptr.pos + record.ptr.rlen)
+
+
########################################################################
########################################################################
## Variant Header objects
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
cdef bcf_hrec_t *r = self.ptr
if not r:
return
- assert(r.key)
+ assert r.key
cdef char *key = r.key if r.type == BCF_HL_GEN else r.value
- print('Removing header type={} key={} value={} hdr={}'.format(METADATA_TYPES[r.type], r.key, r.value, key))
bcf_hdr_remove(hdr, r.type, key)
self.ptr = NULL
def remove_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
- cdef const char *bkey = hdr.id[BCF_DT_ID][self.id].key
- bcf_hdr_remove(hdr, self.type, bkey)
+ cdef const char *key = hdr.id[BCF_DT_ID][self.id].key
+ bcf_hdr_remove(hdr, self.type, key)
cdef VariantMetadata makeVariantMetadata(VariantHeader header, int type, int id):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
- raise KeyError('invalid key')
+ raise KeyError('invalid key: {}'.format(key))
return makeVariantMetadata(self.header, self.type, kh_val_vdict(d, k).id)
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_ID]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d) or kh_val_vdict(d, k).info[self.type] & 0xF == 0xF:
- raise KeyError('invalid key')
+ raise KeyError('invalid key: {}'.format(key))
bcf_hdr_remove(hdr, self.type, bkey)
#bcf_hdr_sync(hdr)
return length if length else None
@property
- def header(self):
+ def header_record(self):
""":class:`VariantHeaderRecord` associated with this :class:`VariantContig` object"""
cdef bcf_hdr_t *hdr = self.header.ptr
cdef bcf_hrec_t *hrec = hdr.id[BCF_DT_CTG][self.id].val.hrec[0]
def remove_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
- cdef const char *bkey = hdr.id[BCF_DT_CTG][self.id].key
- bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+ cdef const char *key = hdr.id[BCF_DT_CTG][self.id].key
+ bcf_hdr_remove(hdr, BCF_HL_CTG, key)
cdef VariantContig makeVariantContig(VariantHeader header, int id):
return makeVariantContig(self.header, index)
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_CTG]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
if k == kh_end(d):
- raise KeyError('invalid contig')
+ raise KeyError('invalid contig: {}'.format(key))
cdef int id = kh_val_vdict(d, k).id
def remove_header(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef int index
- cdef const char *bkey
+ cdef const char *ckey
cdef vdict_t *d
cdef khiter_t k
index = key
if index < 0 or index >= hdr.n[BCF_DT_CTG]:
raise IndexError('invalid contig index')
- bkey = hdr.id[BCF_DT_CTG][self.id].key
+ ckey = hdr.id[BCF_DT_CTG][self.id].key
else:
d = <vdict_t *>hdr.dict[BCF_DT_CTG]
key = force_bytes(key)
if kh_get_vdict(d, key) == kh_end(d):
- raise KeyError('invalid contig')
- bkey = key
+ raise KeyError('invalid contig: {}'.format(key))
+ ckey = key
- bcf_hdr_remove(hdr, BCF_HL_CTG, bkey)
+ bcf_hdr_remove(hdr, BCF_HL_CTG, ckey)
def clear_header(self):
cdef bcf_hdr_t *hdr = self.header.ptr
if id in self:
raise ValueError('Header already exists for contig {}'.format(id))
- items = [('ID', id)] + kwargs.items()
+ items = [('ID', id)]
+ items += kwargs.items()
self.header.add_meta('contig', items=items)
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.header.ptr
cdef vdict_t *d = <vdict_t *>hdr.dict[BCF_DT_SAMPLE]
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef khiter_t k = kh_get_vdict(d, bkey)
return k != kh_end(d)
self.ptr = NULL
def __bool__(self):
- # self.ptr == NULL should be impossible
return self.ptr != NULL
def copy(self):
finally:
free(hstr)
- cpdef VariantRecord new_record(self):
- """Create a new empty VariantRecord"""
- r = makeVariantRecord(self, bcf_init())
- r.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
- return r
+ def new_record(self, contig=None, start=0, stop=0, alleles=None,
+ id=None, qual=None, filter=None, info=None, samples=None,
+ **kwargs):
+ """Create a new empty VariantRecord.
+
+ Arguments are currently experimental. Use with caution and expect
+ changes in upcoming releases.
+
+ """
+ rec = makeVariantRecord(self, bcf_init())
+ rec.ptr.n_sample = bcf_hdr_nsamples(self.ptr)
+
+ if contig is not None:
+ rec.contig = contig
+ if alleles is not None:
+ rec.alleles = alleles
+
+ rec.start = start
+ rec.stop = stop
+ rec.id = id
+ rec.qual = qual
+
+ if filter is not None:
+ if isinstance(filter, (list, tuple, VariantRecordFilter)):
+ for f in filter:
+ rec.filter.add(f)
+ else:
+ rec.filter.add(filter)
+
+ if info:
+ rec.info.update(info)
+
+ if kwargs:
+ if 'GT' in kwargs:
+ rec.samples[0]['GT'] = kwargs.pop('GT')
+ rec.samples[0].update(kwargs)
+
+ if samples:
+ for i, sample in enumerate(samples):
+ if 'GT' in sample:
+ rec.samples[i]['GT'] = sample.pop('GT')
+ rec.samples[i].update(sample)
+
+ return rec
def add_record(self, VariantHeaderRecord record):
"""Add an existing :class:`VariantHeaderRecord` to this header"""
return header
+cdef inline int bcf_header_get_info_id(bcf_hdr_t *hdr, key) except? -2:
+ cdef vdict_t *d
+ cdef khiter_t k
+ cdef int info_id
+
+ if isinstance(key, str):
+ key = force_bytes(key)
+
+ d = <vdict_t *>hdr.dict[BCF_DT_ID]
+ k = kh_get_vdict(d, key)
+
+ if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
+ return -1
+
+ return kh_val_vdict(d, k).id
+
+
########################################################################
########################################################################
## Variant Record objects
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
- raise KeyError('Invalid filter')
+ raise KeyError('Invalid filter: {}'.format(key))
return makeVariantMetadata(self.record.header, BCF_HL_FLT, id)
if key == '.':
key = 'PASS'
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id):
- raise KeyError('Invalid filter')
+ raise KeyError('Invalid filter: {}'.format(key))
bcf_add_filter(hdr, r, id)
id = bcf_hdr_id2int(hdr, BCF_DT_ID, bkey)
if not check_header_id(hdr, BCF_HL_FLT, id) or not bcf_has_filter(hdr, r, bkey):
- raise KeyError('Invalid filter')
+ raise KeyError('Invalid filter: {}'.format(key))
bcf_remove_filter(hdr, r, id, 0)
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
return bcf_has_filter(hdr, r, bkey) == 1
def iterkeys(self):
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def __richcmp__(VariantRecordFilter self not None, VariantRecordFilter other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.record.ptr
+ cdef bcf1_t *o = other.record.ptr
+
+ cdef bint cmp = (s.d.n_flt == o.d.n_flt and list(self) == list(other))
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
- raise KeyError('unknown format')
+ raise KeyError('unknown format: {}'.format(key))
return makeVariantMetadata(self.record.header, BCF_HL_FMT, fmt.id)
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
if not fmt or not fmt.p:
- raise KeyError('unknown format')
+ raise KeyError('unknown format: {}'.format(key))
if bcf_update_format(hdr, r, bkey, fmt.p, 0, fmt.type) < 0:
raise ValueError('Unable to delete FORMAT')
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
return fmt != NULL and fmt.p != NULL
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
- return self.record.ptr.n_info
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef const char *key
+ cdef int i, count = 0
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
+ count += 1
+
+ return count
def __bool__(self):
- return self.record.ptr.n_info != 0
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+ cdef bcf_info_t *info
+ cdef const char *key
+ cdef int i
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ for i in range(r.n_info):
+ info = &r.d.info[i]
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if info != NULL and info.vptr != NULL and strcmp(key, b'END') != 0:
+ return True
+
+ return False
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef vdict_t *d
- cdef khiter_t k
- cdef info_id
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
- cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+ cdef bytes bkey = force_bytes(key)
- if not info:
- d = <vdict_t *>hdr.dict[BCF_DT_ID]
- k = kh_get_vdict(d, bkey)
+ if strcmp(bkey, b'END') == 0:
+ raise KeyError('END is a reserved attribute; access is via record.stop')
- if k == kh_end(d) or kh_val_vdict(d, k).info[BCF_HL_INFO] & 0xF == 0xF:
- raise KeyError('Unknown INFO field: {}'.format(key))
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
- info_id = kh_val_vdict(d, k).id
- else:
- info_id = info.key
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if info_id < 0:
+ raise KeyError('Unknown INFO field: {}'.format(key))
if not check_header_id(hdr, BCF_HL_INFO, info_id):
raise ValueError('Invalid header')
+ # Handle type=Flag values
if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
return info != NULL and info.vptr != NULL
return bcf_info_get_value(self.record, info)
def __setitem__(self, key, value):
+ cdef bytes bkey = force_bytes(key)
+
+ if strcmp(bkey, b'END') == 0:
+ raise KeyError('END is a reserved attribute; access is via record.stop')
+
+ if bcf_unpack(self.record.ptr, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
bcf_info_set_value(self.record, key, value)
def __delitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
+ cdef bytes bkey = force_bytes(key)
+ if strcmp(bkey, b'END') == 0:
+ raise KeyError('END is a reserved attribute; access is via record.stop')
+
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if info_id < 0:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ # Handle flags
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
+ return
+
if not info or not info.vptr:
raise KeyError('Unknown INFO field: {}'.format(key))
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if strcmp(key, b'END') == 0:
+ continue
if bcf_update_info(hdr, r, key, NULL, 0, info.type) < 0:
raise ValueError('Unable to delete INFO')
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
+ cdef const char *key
cdef int i
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
- yield bcf_str_cache_get_charptr(bcf_hdr_int2id(hdr, BCF_DT_ID, info.key))
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if strcmp(key, b'END') != 0:
+ yield bcf_str_cache_get_charptr(key)
def get(self, key, default=None):
"""D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
- try:
- return self[key]
- except KeyError:
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ cdef bytes bkey = force_bytes(key)
+
+ if strcmp(bkey, b'END') == 0:
return default
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ # Handle flags
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG:
+ return info != NULL and info.vptr != NULL
+
+ if not info or not info.vptr:
+ return default
+
+ return bcf_info_get_value(self.record, info)
+
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
if bcf_unpack(r, BCF_UN_INFO) < 0:
raise ValueError('Error unpacking VariantRecord')
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
+
+ if strcmp(bkey, b'END') == 0:
+ return False
+
cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
- return info != NULL
+ return info != NULL and info.vptr != NULL
def iterkeys(self):
"""D.iterkeys() -> an iterator over the keys of D"""
def itervalues(self):
"""D.itervalues() -> an iterator over the values of D"""
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
+ cdef const char *key
cdef int i
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
- yield bcf_info_get_value(self.record, info)
+ key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
+ if strcmp(key, b'END') != 0:
+ yield bcf_info_get_value(self.record, info)
def iteritems(self):
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
cdef bcf_info_t *info
+ cdef const char *key
cdef int i
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
for i in range(r.n_info):
info = &r.d.info[i]
if info and info.vptr:
key = bcf_hdr_int2id(hdr, BCF_DT_ID, info.key)
- value = bcf_info_get_value(self.record, info)
- yield bcf_str_cache_get_charptr(key), value
+ if strcmp(key, b'END') != 0:
+ value = bcf_info_get_value(self.record, info)
+ yield bcf_str_cache_get_charptr(key), value
def keys(self):
"""D.keys() -> list of D's keys"""
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ if k != 'END':
+ self[k] = v
+
+ if kwargs:
+ kwargs.pop('END', None)
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ cdef bcf_hdr_t *hdr = self.record.header.ptr
+ cdef bcf1_t *r = self.record.ptr
+
+ if bcf_unpack(r, BCF_UN_INFO) < 0:
+ raise ValueError('Error unpacking VariantRecord')
+
+ cdef bytes bkey = force_bytes(key)
+ cdef bcf_info_t *info = bcf_get_info(hdr, r, bkey)
+
+ # Cannot stop here if info == NULL, since flags must return False
+ cdef int info_id = bcf_header_get_info_id(hdr, bkey) if not info else info.key
+
+ if info_id < 0:
+ if default is _nothing:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+ return default
+
+ if not check_header_id(hdr, BCF_HL_INFO, info_id):
+ raise ValueError('Invalid header')
+
+ # Handle flags
+ if bcf_hdr_id2type(hdr, BCF_HL_INFO, info_id) == BCF_HT_FLAG and (not info or not info.vptr):
+ return
+
+ if not info or not info.vptr:
+ if default is _nothing:
+ raise KeyError('Unknown INFO field: {}'.format(key))
+ return default
+
+ value = bcf_info_get_value(self.record, info)
+
+ if bcf_update_info(hdr, r, bkey, NULL, 0, info.type) < 0:
+ raise ValueError('Unable to delete INFO')
+
+ return value
+
+ def __richcmp__(VariantRecordInfo self not None, VariantRecordInfo other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.record.ptr
+ cdef bcf1_t *o = other.record.ptr
+
+ # Cannot use n_info as shortcut logic, since null values may remain
+ cdef bint cmp = dict(self) == dict(other)
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
- #TODO: implement __richcmp__
-
cdef VariantRecordInfo makeVariantRecordInfo(VariantRecord record):
if not record:
raise TypeError('this class cannot be instantiated from Python')
def __len__(self):
- return bcf_hdr_nsamples(self.record.header.ptr)
+ return self.record.ptr.n_sample # bcf_hdr_nsamples(self.record.header.ptr)
def __bool__(self):
- return bcf_hdr_nsamples(self.record.header.ptr) != 0
+ return self.record.ptr.n_sample != 0 # bcf_hdr_nsamples(self.record.header.ptr) != 0
def __getitem__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int n = self.record.ptr.n_sample
cdef int sample_index
cdef vdict_t *d
cdef khiter_t k
bkey = force_bytes(key)
sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
- raise KeyError('invalid sample name')
+ raise KeyError('invalid sample name: {}'.format(key))
if sample_index < 0 or sample_index >= n:
raise IndexError('invalid sample index')
def __iter__(self):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield charptr_to_str(hdr.samples[i])
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int n = bcf_hdr_nsamples(hdr)
+ cdef int n = self.record.ptr.n_sample
cdef int sample_index
cdef vdict_t *d
cdef khiter_t k
bkey = force_bytes(key)
sample_index = bcf_hdr_id2int(hdr, BCF_DT_SAMPLE, bkey)
if sample_index < 0:
- raise KeyError('invalid sample name')
+ raise KeyError('invalid sample name: {}'.format(key))
return 0 <= sample_index < n
"""D.itervalues() -> an iterator over the values of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield makeVariantRecordSample(self.record, i)
"""D.iteritems() -> an iterator over the (key, value) items of D"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t i, n = bcf_hdr_nsamples(hdr)
+ cdef int32_t i, n = self.record.ptr.n_sample
for i in range(n):
yield (charptr_to_str(hdr.samples[i]), makeVariantRecordSample(self.record, i))
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
+ def __richcmp__(VariantRecordSamples self not None, VariantRecordSamples other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.record.ptr
+ cdef bcf1_t *o = other.record.ptr
+
+ cdef bint cmp = (s.n_sample == o.n_sample and self.values() == other.values())
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
- #TODO: implement __richcmp__
-
cdef VariantRecordSamples makeVariantRecordSamples(VariantRecord record):
if not record:
raise ValueError(msg.format(self.ptr.n_sample, bcf_hdr_nsamples(dst_hdr)))
bcf_translate(dst_hdr, src_hdr, self.ptr)
+ self.header = dst_header
@property
def rid(self):
if p < 1:
raise ValueError('Position must be positive')
self.ptr.pos = p - 1
+ bcf_sync_end(self)
@property
def start(self):
if s < 0:
raise ValueError('Start coordinate must be non-negative')
self.ptr.pos = s
+ bcf_sync_end(self)
@property
def stop(self):
@stop.setter
def stop(self, value):
cdef int s = value
- if s < self.ptr.pos:
- raise ValueError('Stop coordinate must be greater than or equal to start')
+ if s < 0:
+ raise ValueError('Stop coordinate must be non-negative')
self.ptr.rlen = s - self.ptr.pos
- if self.ptr.rlen != len(self.ref) or 'END' in self.info:
- self.info['END'] = s
+ bcf_sync_end(self)
@property
def rlen(self):
- """record length on chrom/contig (typically rec.stop - rec.start unless END info is supplied)"""
+ """record length on chrom/contig (aka rec.stop - rec.start)"""
return self.ptr.rlen
@rlen.setter
def rlen(self, value):
cdef int r = value
- if r < 0:
- raise ValueError('Reference length must be non-negative')
self.ptr.rlen = r
- if r != len(self.ref) or 'END' in self.info:
- self.info['END'] = self.ptr.pos + r
+ bcf_sync_end(self)
@property
def qual(self):
else:
alleles = [value]
self.alleles = alleles
+ self.ptr.rlen = len(value)
+ bcf_sync_end(self)
@property
def alleles(self):
return res
@alleles.setter
- def alleles(self, value):
+ def alleles(self, values):
cdef bcf1_t *r = self.ptr
+
if bcf_unpack(r, BCF_UN_STR) < 0:
raise ValueError('Error unpacking VariantRecord')
- value = [force_bytes(v) for v in value]
- if b'' in value:
+
+ values = [force_bytes(v) for v in values]
+
+ if len(values) < 2:
+ raise ValueError('must set at least 2 alleles')
+
+ if b'' in values:
raise ValueError('cannot set null allele')
- value = b','.join(value)
+
+ value = b','.join(values)
+
if bcf_update_alleles_str(self.header.ptr, r, value) < 0:
raise ValueError('Error updating alleles')
+ self.ptr.rlen = len(values[0])
+ bcf_sync_end(self)
+
@property
def alts(self):
"""tuple of alt alleles"""
raise ValueError('Error unpacking VariantRecord')
return makeVariantRecordSamples(self)
+ def __richcmp__(VariantRecord self not None, VariantRecord other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bcf1_t *s = self.ptr
+ cdef bcf1_t *o = other.ptr
+
+ cdef bint cmp = self is other or (
+ s.pos == o.pos
+ and s.rlen == o.rlen
+ and ((bcf_float_is_missing(s.qual) and bcf_float_is_missing(o.qual))
+ or s.qual == o.qual)
+ and s.n_sample == o.n_sample
+ and s.n_allele == o.n_allele
+ and self.contig == other.contig
+ and self.alleles == other.alleles
+ and self.id == other.id
+ and self.info == other.info
+ and self.filter == other.filter
+ and self.samples == other.samples)
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
def __str__(self):
cdef kstring_t line
cdef char c
"""sample name"""
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- cdef int32_t n = bcf_hdr_nsamples(hdr)
+ cdef int32_t n = r.n_sample
if self.index < 0 or self.index >= n:
raise ValueError('invalid sample index')
def __contains__(self, key):
cdef bcf_hdr_t *hdr = self.record.header.ptr
cdef bcf1_t *r = self.record.ptr
- bkey = force_bytes(key)
+ cdef bytes bkey = force_bytes(key)
cdef bcf_fmt_t *fmt = bcf_get_fmt(hdr, r, bkey)
return fmt != NULL and fmt.p != NULL
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
+ def __richcmp__(VariantRecordSample self not None, VariantRecordSample other not None, int op):
+ if op != 2 and op != 3:
+ return NotImplemented
+
+ cdef bint cmp = dict(self) == dict(other)
+
+ if op == 3:
+ cmp = not cmp
+
+ return cmp
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
- #TODO: implement __richcmp__
-
cdef VariantRecordSample makeVariantRecordSample(VariantRecord record, int32_t sample_index):
if not record or sample_index < 0:
"""D.values() -> list of D's values"""
return list(self.itervalues())
+ def update(self, items=None, **kwargs):
+ """D.update([E, ]**F) -> None.
+
+ Update D from dict/iterable E and F.
+ """
+ for k, v in items.items():
+ self[k] = v
+
+ if kwargs:
+ for k, v in kwargs.items():
+ self[k] = v
+
+ def pop(self, key, default=_nothing):
+ try:
+ value = self[key]
+ del self[key]
+ return value
+ except KeyError:
+ if default is not _nothing:
+ return default
+ raise
+
# Mappings are not hashable by default, but subclasses can change this
__hash__ = None
try:
rid = index.refmap[contig]
except KeyError:
- raise ValueError('Unknown contig specified')
+ raise ValueError('Unknown contig specified: {}'.format(contig))
if start is None:
start = 0
cdef class VariantFile(HTSFile):
"""*(filename, mode=None, index_filename=None, header=None, drop_samples=False,
- duplicate_filehandle=True)*
+ duplicate_filehandle=True, ignore_truncation=False)*
A :term:`VCF`/:term:`BCF` formatted file. The file is automatically
opened.
drop_samples: bool
Ignore sample information when reading.
- duplicate_filehandle: bool
+ duplicate_filehandle: bool
By default, file handles passed either directly or through
File-like objects will be duplicated before passing them to
htslib. The duplication prevents issues where the same stream
high-level python object. Set to False to turn off
duplication.
+ ignore_truncation: bool
+ Issue a warning, instead of raising an error if the current file
+ appears to be truncated due to a missing EOF marker. Only applies
+ to bgzipped formats. (Default=False)
+
"""
def __cinit__(self, *args, **kwargs):
self.htsfile = NULL
self.open(*args, **kwargs)
+ def __dealloc__(self):
+ if not self.htsfile or not self.header:
+ return
+
+ # Write header if no records were written
+ if self.htsfile.is_write and not self.header_written:
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
+ cdef int ret = hts_close(self.htsfile)
+ self.htsfile = NULL
+ self.header = self.index = None
+
+ if ret < 0:
+ global errno
+ if errno == EPIPE:
+ errno = 0
+ else:
+ raise OSError(errno, force_str(strerror(errno)))
+
def close(self):
"""closes the :class:`pysam.VariantFile`."""
- cdef int ret = 0
- self.header = self.index = None
- if self.htsfile:
- # Write header if no records were written
- if self.htsfile.is_write and not self.header_written:
- self.header_written = True
- with nogil:
- bcf_hdr_write(self.htsfile, self.header.ptr)
+ if not self.htsfile:
+ return
- ret = hts_close(self.htsfile)
- self.htsfile = NULL
+ # Write header if no records were written
+ if self.htsfile.is_write and not self.header_written:
+ with nogil:
+ bcf_hdr_write(self.htsfile, self.header.ptr)
+
+ cdef int ret = hts_close(self.htsfile)
+ self.htsfile = NULL
+ self.header = self.index = None
if ret < 0:
global errno
if ret == -1:
raise StopIteration
elif ret == -2:
- raise IOError('truncated file')
+ raise OSError('truncated file')
else:
raise ValueError('Variant read failed')
index_filename=None,
VariantHeader header=None,
drop_samples=False,
- duplicate_filehandle=True):
+ duplicate_filehandle=True,
+ ignore_truncation=False):
"""open a vcf/bcf file.
If open is called on an existing VariantFile, the current file will be
elif mode.startswith(b'r'):
# open file for reading
-
if not self._exists():
raise IOError('file `{}` not found'.format(filename))
if self.htsfile.format.format not in (bcf, vcf):
raise ValueError("invalid file `{}` (mode='{}') - is it VCF/BCF format?".format(filename, mode))
- if self.htsfile.format.compression == bgzf:
- bgzfp = hts_get_bgzfp(self.htsfile)
- if bgzfp and bgzf_check_EOF(bgzfp) == 0:
- warn('[%s] Warning: no BGZF EOF marker; file may be truncated'.format(filename))
+ self.check_truncation(ignore_truncation)
with nogil:
hdr = bcf_hdr_read(self.htsfile)
"""reset file position to beginning of file just after the header."""
return self.seek(self.start_offset)
-
def fetch(self, contig=None, start=None, stop=None, region=None, reopen=False):
"""fetch records in a :term:`region` using 0-based indexing. The
region is specified by :term:`contig`, *start* and *end*.
self.is_reading = 1
return self.index.fetch(self, contig, start, stop, region, reopen)
- cpdef VariantRecord new_record(self):
- """Create a new empty VariantRecord"""
- return self.header.new_record()
+ def new_record(self, *args, **kwargs):
+ """Create a new empty :class:`VariantRecord`.
+
+ See :meth:`VariantHeader.new_record`
+ """
+ return self.header.new_record(*args, **kwargs)
cpdef int write(self, VariantRecord record) except -1:
"""
msg = 'Invalid VariantRecord. Number of samples does not match header ({} vs {})'
raise ValueError(msg.format(record.ptr.n_sample, bcf_hdr_nsamples(self.header.ptr)))
+ # Sync END annotation before writing
+ bcf_sync_end(record)
+
cdef int ret
with nogil:
--- /dev/null
+cdef extern from "cbcftools_util.h":
+
+ int bcftools_main(int argc, char *argv[])
--- /dev/null
+def py_bcftools():
+ pass
from cpython.object cimport PyObject
from cpython.bytes cimport PyBytes_FromStringAndSize, _PyBytes_Resize
-from pysam.libcutils cimport force_bytes, force_str, charptr_to_str, charptr_to_str_w_len
-from pysam.libchtslib cimport *
-
+from pysam.libcutils cimport force_bytes, encode_filename
+from pysam.libchtslib cimport bgzf_open, bgzf_index_build_init, bgzf_write, bgzf_read, \
+ bgzf_flush, bgzf_index_dump, bgzf_close, bgzf_seek, \
+ bgzf_tell, bgzf_getline, kstring_t, SEEK_SET, BGZF
__all__ = ["BGZFile"]
compressed file in text mode, use the gzip.open() function.
"""
cdef BGZF* bgzf
- cdef bytes name, index
+ cdef readonly object name, index
def __init__(self, filename, mode=None, index=None):
"""Constructor for the BGZFile class.
raise ValueError("Invalid mode: {!r}".format(mode))
if not mode:
mode = 'rb'
- if mode and 'b' not in mode:
+ elif mode and 'b' not in mode:
mode += 'b'
- self.name = force_bytes(filename)
- self.index = force_bytes(index) if index is not None else None
+
+ mode = force_bytes(mode)
+
+ self.name = encode_filename(filename)
+ self.index = encode_filename(index) if index is not None else None
+
self.bgzf = bgzf_open(self.name, mode)
if self.bgzf.is_write and index is not None and bgzf_index_build_init(self.bgzf) < 0:
def __dealloc__(self):
self.close()
- def write(self,data):
+ def write(self, data):
if not self.bgzf:
raise ValueError("write() on closed BGZFile object")
def seekable(self):
return True
+ def tell(self):
+ if not self.bgzf:
+ raise ValueError("seek() on closed BGZFile object")
+ cdef int64_t off = bgzf_tell(self.bgzf)
+ if off < 0:
+ raise IOError('Error in tell on BGZFFile object')
+
+ return off
+
def seek(self, offset, whence=io.SEEK_SET):
if not self.bgzf:
raise ValueError("seek() on closed BGZFile object")
line.l = line.m = 0
line.s = NULL
- if bgzf_getline(self.bgzf, '\n', &line) < 0:
- raise IOError('Error reading line in BGZFFile object')
- ret = charptr_to_str_w_len(line.s, line.l)
+ cdef int ret = bgzf_getline(self.bgzf, '\n', &line)
+ if ret == -1:
+ s = b''
+ elif ret == -2:
+ if line.m:
+ free(line.s)
+ raise IOError('Error reading line in BGZFFile object')
+ else:
+ s = line.s[:line.l]
if line.m:
free(line.s)
- return ret
+ return s
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ line = self.readline()
+ if not line:
+ raise StopIteration()
+ return line
from pysam.libchtslib cimport \
faidx_nseq, fai_load, fai_destroy, fai_fetch, \
- faidx_seq_len, \
+ faidx_seq_len, faidx_iseq, faidx_seq_len, \
faidx_fetch_seq, hisremote, \
bgzf_open, bgzf_close
if self.fastafile == NULL:
raise IOError("could not open file `%s`" % filename)
- if self.is_remote:
- filepath_index = os.path.basename(
- re.sub("[^:]+:[/]*", "", filename)) + ".fai"
- elif filepath_index is None:
- filepath_index = filename + ".fai"
-
- if not os.path.exists(filepath_index):
- raise ValueError("could not locate index file {}".format(
- filepath_index))
-
- with open(filepath_index) as inf:
- data = [x.split("\t") for x in inf]
- self._references = tuple(x[0] for x in data)
- self._lengths = tuple(int(x[1]) for x in data)
- self.reference2length = dict(zip(self._references, self._lengths))
+ cdef int nreferences = faidx_nseq(self.fastafile)
+ cdef int x
+ cdef const char * s
+ self._references = []
+ self._lengths = []
+ for x from 0 <= x < nreferences:
+ s = faidx_iseq(self.fastafile, x)
+ ss = force_str(s)
+ self._references.append(ss)
+ self._lengths.append(faidx_seq_len(self.fastafile, s))
+ self.reference2length = dict(zip(self._references, self._lengths))
def close(self):
"""close the file."""
... print(entry.sequence)
... print(entry.comment)
... print(entry.quality)
+ >>> with pysam.FastxFile(filename) as fin, open(out_filename, mode='w') as fout:
+ ... for entry in fin:
+ ... fout.write(str(entry))
"""
def __cinit__(self, *args, **kwargs):
FILE* PyFile_AsFile(object)
+# cython does not wrap stdarg
+cdef extern from "stdarg.h":
+ ctypedef struct va_list:
+ pass
+
+
cdef extern from "htslib/kstring.h" nogil:
ctypedef struct kstring_t:
size_t l, m
# @abstract Open the named file or URL as a stream
# @return An hFILE pointer, or NULL (with errno set) if an error occurred.
- hFILE *hopen(const char *filename, const char *mode)
+ hFILE *hopen(const char *filename, const char *mode, ...)
# @abstract Associate a stream with an existing open file descriptor
# @return An hFILE pointer, or NULL (with errno set) if an error occurred.
# @return The character read, or EOF on end-of-file or error
int hgetc(hFILE *fp)
+ # Read from the stream until the delimiter, up to a maximum length
+ # @param buffer The buffer into which bytes will be written
+ # @param size The size of the buffer
+ # @param delim The delimiter (interpreted as an `unsigned char`)
+ # @param fp The file stream
+ # @return The number of bytes read, or negative on error.
+ # @since 1.4
+ #
+ # Bytes will be read into the buffer up to and including a delimiter, until
+ # EOF is reached, or _size-1_ bytes have been written, whichever comes first.
+ # The string will then be terminated with a NUL byte (`\0`).
+ ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp)
+
+ # Read a line from the stream, up to a maximum length
+ # @param buffer The buffer into which bytes will be written
+ # @param size The size of the buffer
+ # @param fp The file stream
+ # @return The number of bytes read, or negative on error.
+ # @since 1.4
+ #
+ # Specialization of hgetdelim() for a `\n` delimiter.
+ ssize_t hgetln(char *buffer, size_t size, hFILE *fp)
+
+ # Read a line from the stream, up to a maximum length
+ # @param buffer The buffer into which bytes will be written
+ # @param size The size of the buffer (must be > 1 to be useful)
+ # @param fp The file stream
+ # @return _buffer_ on success, or `NULL` if an error occurred.
+ # @since 1.4
+ #
+ # This function can be used as a replacement for `fgets(3)`, or together with
+ # kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_.
+ char *hgets(char *buffer, int size, hFILE *fp)
+
# @abstract Peek at characters to be read without removing them from buffers
# @param fp The file stream
# @param buffer The buffer to which the peeked bytes will be written
# @return The index, or NULL if an error occurred.
hts_idx_t *hts_idx_load2(const char *fn, const char *fnidx)
- uint8_t *hts_idx_get_meta(hts_idx_t *idx, int *l_meta)
+ uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta)
void hts_idx_set_meta(hts_idx_t *idx, int l_meta, uint8_t *meta, int is_copy)
int hts_idx_get_stat(const hts_idx_t* idx, int tid,
int hts_file_type(const char *fname)
+ # /***************************
+ # * Revised MAQ error model *
+ # ***************************/
+
+ ctypedef struct errmod_t
+
+ errmod_t *errmod_init(double depcorr)
+ void errmod_destroy(errmod_t *em)
+
+ # /*
+ # n: number of bases
+ # m: maximum base
+ # bases[i]: qual:6, strand:1, base:4
+ # q[i*m+j]: phred-scaled likelihood of (i,j)
+ # */
+ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *Probabilistic)
+
+ # /*****************************************
+ # * q banded glocal alignment *
+ # *****************************************/
+
+ ctypedef struct probaln_par_t:
+ float d, e
+ int bw;
+
+ int probaln_glocal(const uint8_t *ref,
+ int l_ref,
+ const uint8_t *query,
+ int l_query, const uint8_t *iqual,
+ const probaln_par_t *c,
+ int *state, uint8_t *q)
+
+ # /**********************
+ # * MD5 implementation *
+ # **********************/
+
+ ctypedef struct hts_md5_context
+
+ # /*! @abstract Intialises an MD5 context.
+ # * @discussion
+ # * The expected use is to allocate an hts_md5_context using
+ # * hts_md5_init(). This pointer is then passed into one or more calls
+ # * of hts_md5_update() to compute successive internal portions of the
+ # * MD5 sum, which can then be externalised as a full 16-byte MD5sum
+ # * calculation by calling hts_md5_final(). This can then be turned
+ # * into ASCII via hts_md5_hex().
+ # *
+ # * To dealloate any resources created by hts_md5_init() call the
+ # * hts_md5_destroy() function.
+ # *
+ # * @return hts_md5_context pointer on success, NULL otherwise.
+ # */
+ hts_md5_context *hts_md5_init()
+
+ # /*! @abstract Updates the context with the MD5 of the data. */
+ void hts_md5_update(hts_md5_context *ctx, const void *data, unsigned long size)
+
+ # /*! @abstract Computes the final 128-bit MD5 hash from the given context */
+ void hts_md5_final(unsigned char *digest, hts_md5_context *ctx)
+
+ # /*! @abstract Resets an md5_context to the initial state, as returned
+ # * by hts_md5_init().
+ # */
+ void hts_md5_reset(hts_md5_context *ctx)
+
+ # /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated
+ # * hex string.
+ # */
+ void hts_md5_hex(char *hex, const unsigned char *digest)
+
+ # /*! @abstract Deallocates any memory allocated by hts_md5_init. */
+ void hts_md5_destroy(hts_md5_context *ctx)
+
inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
inline int hts_bin_bot(int bin, int n_lvls)
uint8_t qual
uint8_t l_qname
uint16_t flag
- uint16_t n_cigar
+ uint8_t unused1
+ uint8_t l_extranul
+ uint32_t n_cigar
int32_t l_qseq
int32_t mtid
int32_t mpos
#*************************************
uint8_t *bam_aux_get(const bam1_t *b, const char *tag)
- int32_t bam_aux2i(const uint8_t *s)
+ int64_t bam_aux2i(const uint8_t *s)
double bam_aux2f(const uint8_t *s)
char bam_aux2A(const uint8_t *s)
char *bam_aux2Z(const uint8_t *s)
#*** Pileup and Mpileup ***
#**************************
+ # @abstract Generic pileup 'client data'.
+ # @discussion The pileup iterator allows setting a constructor and
+ # destructor function, which will be called every time a sequence is
+ # fetched and discarded. This permits caching of per-sequence data in
+ # a tidy manner during the pileup process. This union is the cached
+ # data to be manipulated by the "client" (the caller of pileup).
+ #
+ union bam_pileup_cd:
+ void *p
+ int64_t i
+ double f
+
# @abstract Structure for one alignment covering the pileup position.
# @field b pointer to the alignment
# @field qpos position of the read base at the pileup site, 0-based
uint32_t is_tail
uint32_t is_refskip
uint32_t aux
+ bam_pileup_cd cd
ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
ctypedef int (*bam_test_f)()
# Added by AH
# ctypedef bam_pileup1_t * const_bam_pileup1_t_ptr "const bam_pileup1_t *"
+ # ***********************************
+ # * BAQ calculation and realignment *
+ # ***********************************/
+ int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres)
+ int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag)
+
cdef extern from "htslib/faidx.h" nogil:
ctypedef struct faidx_t:
pass
+ # /// Build index for a FASTA or bgzip-compressed FASTA file.
+ # /** @param fn FASTA file name
+ # @param fnfai Name of .fai file to build.
+ # @param fngzi Name of .gzi file to build (if fn is bgzip-compressed).
+ # @return 0 on success; or -1 on failure
+
+ # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
+ # If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI
+ # file will only be built if fn is bgzip-compressed.
+ # */
+ int fai_build3(const char *fn,
+ const char *fnfai,
+ const char *fngzi)
+
+ # /// Build index for a FASTA or bgzip-compressed FASTA file.
+ # /** @param fn FASTA file name
+ # @return 0 on success; or -1 on failure
+ #
+ # File "fn.fai" will be generated. This function is equivalent to
+ # fai_build3(fn, NULL, NULL);
+ # */
int fai_build(char *fn)
+ # /// Destroy a faidx_t struct
void fai_destroy(faidx_t *fai)
+ # /// Load FASTA indexes.
+ # /** @param fn File name of the FASTA file (can be compressed with bgzip).
+ # @param fnfai File name of the FASTA index.
+ # @param fngzi File name of the bgzip index.
+ # @param flags Option flags to control index file caching and creation.
+ # @return Pointer to a faidx_t struct on success, NULL on failure.
+
+ # If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
+ # If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
+ # The bgzip index is only needed if fn is compressed.
+
+ # If (flags & FAI_CREATE) is true, the index files will be built using
+ # fai_build3() if they are not already present.
+ # */
+ faidx_t *fai_load3(const char *fn,
+ const char *fnfai,
+ const char *fngzi,
+ int flags)
+
+ # /// Load index from "fn.fai".
+ # /** @param fn File name of the FASTA file
+ # @return Pointer to a faidx_t struct on success, NULL on failure.
+ # This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE);
+ # */
faidx_t *fai_load(char *fn)
+ # /// Fetch the sequence in a region
+ # /** @param fai Pointer to the faidx_t struct
+ # @param reg Region in the format "chr2:20,000-30,000"
+ # @param len Length of the region; -2 if seq not present, -1 general error
+ # @return Pointer to the sequence; `NULL` on failure
+ # The returned sequence is allocated by `malloc()` family and should be destroyed
+ # by end users by calling `free()` on it.
+ # */
char *fai_fetch(faidx_t *fai,
char *reg,
int *len)
- int faidx_nseq(faidx_t *fai)
-
- int faidx_has_seq(faidx_t *fai, const char *seq)
-
+ # /// Fetch the sequence in a region
+ # /** @param fai Pointer to the faidx_t struct
+ # @param c_name Region name
+ # @param p_beg_i Beginning position number (zero-based)
+ # @param p_end_i End position number (zero-based)
+ # @param len Length of the region; -2 if c_name not present, -1 general error
+ # @return Pointer to the sequence; null on failure
+ # The returned sequence is allocated by `malloc()` family and should be destroyed
+ # by end users by calling `free()` on it.
+ # */
char *faidx_fetch_seq(faidx_t *fai,
char *c_name,
int p_beg_i,
int p_end_i,
int *len)
- int faidx_seq_len(faidx_t *fai, const char *seq)
+ # /// Query if sequence is present
+ # /** @param fai Pointer to the faidx_t struct
+ # @param seq Sequence name
+ # @return 1 if present or 0 if absent
+ # */
+ int faidx_has_seq(faidx_t *fai, const char *seq)
+
+ # /// Fetch the number of sequences
+ # /** @param fai Pointer to the faidx_t struct
+ # @return The number of sequences
+ # */
+ int faidx_nseq(const faidx_t *fai)
+ # /// Return name of i-th sequence
+ const char *faidx_iseq(const faidx_t *fai, int i)
+
+ # /// Return sequence length, -1 if not present
+ int faidx_seq_len(faidx_t *fai, const char *seq)
# tabix support
cdef extern from "htslib/tbx.h" nogil:
int bcf_get_format_int32(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, int32_t **dst, int *ndst)
int bcf_get_format_float(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, float **dst, int *ndst)
int bcf_get_format_char(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char **dst, int *ndst)
- int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int **dst, int *ndst)
+ int bcf_get_genotypes(const bcf_hdr_t *hdr, bcf1_t *line, int32_t **dst, int *ndst)
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
uint32_t bcf_ij2G(uint32_t i, uint32_t j)
+cdef extern from "htslib/cram.h" nogil:
+
+ enum cram_block_method:
+ ERROR
+ RAW
+ GZIP
+ BZIP2
+ LZMA
+ RANS
+ RANS0
+ RANS1
+ GZIP_RLE
+
+ enum cram_content_type:
+ CT_ERROR
+ FILE_HEADER
+ COMPRESSION_HEADER
+ MAPPED_SLICE
+ UNMAPPED_SLICE
+ EXTERNAL
+ CORE
+
+ # Opaque data types, see cram_structs for the fully fledged versions.
+ ctypedef struct SAM_hdr
+ ctypedef struct cram_file_def
+ ctypedef struct cram_fd
+ ctypedef struct cram_container
+ ctypedef struct cram_block
+ ctypedef struct cram_slice
+ ctypedef struct cram_metrics
+ ctypedef struct cram_block_slice_hdr
+ ctypedef struct cram_block_compression_hdr
+ ctypedef struct refs_t
+
+ # Accessor functions
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_fd
+ #
+ SAM_hdr *cram_fd_get_header(cram_fd *fd)
+ void cram_fd_set_header(cram_fd *fd, SAM_hdr *hdr)
+
+ int cram_fd_get_version(cram_fd *fd)
+ void cram_fd_set_version(cram_fd *fd, int vers)
+
+ int cram_major_vers(cram_fd *fd)
+ int cram_minor_vers(cram_fd *fd)
+
+ hFILE *cram_fd_get_fp(cram_fd *fd)
+ void cram_fd_set_fp(cram_fd *fd, hFILE *fp)
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_container
+ #
+ int32_t cram_container_get_length(cram_container *c)
+ void cram_container_set_length(cram_container *c, int32_t length)
+ int32_t cram_container_get_num_blocks(cram_container *c)
+ void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks)
+ int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks)
+ void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks,
+ int32_t *landmarks)
+
+ # Returns true if the container is empty (EOF marker) */
+ int cram_container_is_empty(cram_fd *fd)
+
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_block
+ #
+ int32_t cram_block_get_content_id(cram_block *b)
+ int32_t cram_block_get_comp_size(cram_block *b)
+ int32_t cram_block_get_uncomp_size(cram_block *b)
+ int32_t cram_block_get_crc32(cram_block *b)
+ void * cram_block_get_data(cram_block *b)
+
+ cram_content_type cram_block_get_content_type(cram_block *b)
+
+ void cram_block_set_content_id(cram_block *b, int32_t id)
+ void cram_block_set_comp_size(cram_block *b, int32_t size)
+ void cram_block_set_uncomp_size(cram_block *b, int32_t size)
+ void cram_block_set_crc32(cram_block *b, int32_t crc)
+ void cram_block_set_data(cram_block *b, void *data)
+
+ int cram_block_append(cram_block *b, void *data, int size)
+ void cram_block_update_size(cram_block *b)
+
+ # Offset is known as "size" internally, but it can be confusing.
+ size_t cram_block_get_offset(cram_block *b)
+ void cram_block_set_offset(cram_block *b, size_t offset)
+
+ #
+ # Computes the size of a cram block, including the block
+ # header itself.
+ #
+ uint32_t cram_block_size(cram_block *b)
+
+ #
+ # Renumbers RG numbers in a cram compression header.
+ #
+ # CRAM stores RG as the Nth number in the header, rather than a
+ # string holding the ID: tag. This is smaller in space, but means
+ # "samtools cat" to join files together that contain single but
+ # different RG lines needs a way of renumbering them.
+ #
+ # The file descriptor is expected to be immediately after the
+ # cram_container structure (ie before the cram compression header).
+ # Due to the nature of the CRAM format, this needs to read and write
+ # the blocks itself. Note that there may be multiple slices within
+ # the container, meaning multiple compression headers to manipulate.
+ # Changing RG may change the size of the compression header and
+ # therefore the length field in the container. Hence we rewrite all
+ # blocks just incase and also emit the adjusted container.
+ #
+ # The current implementation can only cope with renumbering a single
+ # RG (and only then if it is using HUFFMAN or BETA codecs). In
+ # theory it *may* be possible to renumber multiple RGs if they use
+ # HUFFMAN to the CORE block or use an external block unshared by any
+ # other data series. So we have an API that can be upgraded to
+ # support this, but do not implement it for now. An example
+ # implementation of RG as an EXTERNAL block would be to find that
+ # block and rewrite it, returning the number of blocks consumed.
+ #
+ # Returns 0 on success;
+ # -1 if unable to edit;
+ # -2 on other errors (eg I/O).
+ #
+ int cram_transcode_rg(cram_fd *input, cram_fd *output,
+ cram_container *c,
+ int nrg, int *in_rg, int *out_rg)
+
+ #
+ # Copies the blocks representing the next num_slice slices from a
+ # container from 'in' to 'out'. It is expected that the file pointer
+ # is just after the read of the cram_container and cram compression
+ # header.
+ #
+ # Returns 0 on success
+ # -1 on failure
+ #
+ int cram_copy_slice(cram_fd *input, cram_fd *output, int32_t num_slice)
+
+ #
+ #-----------------------------------------------------------------------------
+ # SAM_hdr
+ #
+
+ # Tokenises a SAM header into a hash table.
+ #
+ # Also extracts a few bits on specific data types, such as @RG lines.
+ #
+ # @return
+ # Returns a SAM_hdr struct on success (free with sam_hdr_free())
+ # NULL on failure
+ #
+ SAM_hdr *sam_hdr_parse_(const char *hdr, int len)
+
+
+ #
+ #-----------------------------------------------------------------------------
+ # cram_io basics
+ #
+
+ # CRAM blocks - the dynamically growable data block. We have code to
+ # create, update, (un)compress and read/write.
+ #
+ # These are derived from the deflate_interlaced.c blocks, but with the
+ # CRAM extension of content types and IDs.
+ #
+
+ # Allocates a new cram_block structure with a specified content_type and
+ # id.
+ #
+ # @return
+ # Returns block pointer on success;
+ # NULL on failure
+ #
+ cram_block *cram_new_block(cram_content_type content_type,
+ int content_id)
+
+ # Reads a block from a cram file.
+ #
+ # @return
+ # Returns cram_block pointer on success;
+ # NULL on failure
+ #
+ cram_block *cram_read_block(cram_fd *fd)
+
+ # Writes a CRAM block.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_write_block(cram_fd *fd, cram_block *b)
+
+ # Frees a CRAM block, deallocating internal data too.
+ #
+ void cram_free_block(cram_block *b)
+
+ # Uncompresses a CRAM block, if compressed.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_uncompress_block(cram_block *b)
+
+ # Compresses a block.
+ #
+ # Compresses a block using one of two different zlib strategies. If we only
+ # want one choice set strat2 to be -1.
+ #
+ # The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
+ # or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
+ # significantly faster.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
+ int method, int level)
+
+ # Containers
+ #
+
+ # Creates a new container, specifying the maximum number of slices
+ # and records permitted.
+ #
+ # @return
+ # Returns cram_container ptr on success;
+ # NULL on failure
+ #
+ cram_container *cram_new_container(int nrec, int nslice)
+ void cram_free_container(cram_container *c)
+
+ # Reads a container header.
+ #
+ # @return
+ # Returns cram_container on success;
+ # NULL on failure or no container left (fd->err == 0).
+ #
+ cram_container *cram_read_container(cram_fd *fd)
+
+ # Writes a container structure.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_write_container(cram_fd *fd, cram_container *h)
+
+ #
+ # Stores the container structure in dat and returns *size as the
+ # number of bytes written to dat[]. The input size of dat is also
+ # held in *size and should be initialised to cram_container_size(c).
+ #
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
+
+ int cram_container_size(cram_container *c)
+
+ # The top-level cram opening, closing and option handling
+ #
+
+ # Opens a CRAM file for read (mode "rb") or write ("wb").
+ #
+ # The filename may be "-" to indicate stdin or stdout.
+ #
+ # @return
+ # Returns file handle on success;
+ # NULL on failure.
+ #
+ cram_fd *cram_open(const char *filename, const char *mode)
+
+ # Opens an existing stream for reading or writing.
+ #
+ # @return
+ # Returns file handle on success;
+ # NULL on failure.
+ #
+ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode)
+
+ # Closes a CRAM file.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_close(cram_fd *fd)
+
+ #
+ # Seek within a CRAM file.
+ #
+ # Returns 0 on success
+ # -1 on failure
+ #
+ int cram_seek(cram_fd *fd, off_t offset, int whence)
+
+ #
+ # Flushes a CRAM file.
+ # Useful for when writing to stdout without wishing to close the stream.
+ #
+ # Returns 0 on success
+ # -1 on failure
+ #
+ int cram_flush(cram_fd *fd)
+
+ # Checks for end of file on a cram_fd stream.
+ #
+ # @return
+ # Returns 0 if not at end of file
+ # 1 if we hit an expected EOF (end of range or EOF block)
+ # 2 for other EOF (end of stream without EOF block)
+ #
+ int cram_eof(cram_fd *fd)
+
+ # Sets options on the cram_fd.
+ #
+ # See CRAM_OPT_* definitions in hts.h.
+ # Use this immediately after opening.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_set_option(cram_fd *fd, hts_fmt_option opt, ...)
+
+ # Sets options on the cram_fd.
+ #
+ # See CRAM_OPT_* definitions in hts.h.
+ # Use this immediately after opening.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_set_voption(cram_fd *fd, hts_fmt_option opt, va_list args)
+
+ #
+ # Attaches a header to a cram_fd.
+ #
+ # This should be used when creating a new cram_fd for writing where
+ # we have an SAM_hdr already constructed (eg from a file we've read
+ # in).
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int cram_set_header(cram_fd *fd, SAM_hdr *hdr)
+
+ # Check if this file has a proper EOF block
+ #
+ # @return
+ # Returns 3 if the file is a version of CRAM that does not contain EOF blocks
+ # 2 if the file is a stream and thus unseekable
+ # 1 if the file contains an EOF block
+ # 0 if the file does not contain an EOF block
+ # -1 if an error occured whilst reading the file or we could not seek back to where we were
+ #
+ #
+ int cram_check_EOF(cram_fd *fd)
+
+ # As int32_decoded/encode, but from/to blocks instead of cram_fd */
+ int int32_put_blk(cram_block *b, int32_t val)
+
+ # Deallocates all storage used by a SAM_hdr struct.
+ #
+ # This also decrements the header reference count. If after decrementing
+ # it is still non-zero then the header is assumed to be in use by another
+ # caller and the free is not done.
+ #
+ # This is a synonym for sam_hdr_dec_ref().
+ #
+ void sam_hdr_free(SAM_hdr *hdr)
+
+ # Returns the current length of the SAM_hdr in text form.
+ #
+ # Call sam_hdr_rebuild() first if editing has taken place.
+ #
+ int sam_hdr_length(SAM_hdr *hdr)
+
+ # Returns the string form of the SAM_hdr.
+ #
+ # Call sam_hdr_rebuild() first if editing has taken place.
+ #
+ char *sam_hdr_str(SAM_hdr *hdr)
+
+ # Appends a formatted line to an existing SAM header.
+ #
+ # Line is a full SAM header record, eg "@SQ\tSN:foo\tLN:100", with
+ # optional new-line. If it contains more than 1 line then multiple lines
+ # will be added in order.
+ #
+ # Len is the length of the text data, or 0 if unknown (in which case
+ # it should be null terminated).
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+
+ # Add an @PG line.
+ #
+ # If we wish complete control over this use sam_hdr_add() directly. This
+ # function uses that, but attempts to do a lot of tedious house work for
+ # you too.
+ #
+ # - It will generate a suitable ID if the supplied one clashes.
+ # - It will generate multiple @PG records if we have multiple PG chains.
+ #
+ # Call it as per sam_hdr_add() with a series of key,value pairs ending
+ # in NULL.
+ #
+ # @return
+ # Returns 0 on success;
+ # -1 on failure
+ #
+ int sam_hdr_add_PG(SAM_hdr *sh, const char *name, ...)
+
+ #
+ # A function to help with construction of CL tags in @PG records.
+ # Takes an argc, argv pair and returns a single space-separated string.
+ # This string should be deallocated by the calling function.
+ #
+ # @return
+ # Returns malloced char * on success;
+ # NULL on failure
+ #
+ char *stringify_argv(int argc, char *argv[])
+
+ #
+ # Returns the refs_t structure used by a cram file handle.
+ #
+ # This may be used in conjunction with option CRAM_OPT_SHARED_REF to
+ # share reference memory between multiple file handles.
+ #
+ # @return
+ # Returns NULL if none exists or the file handle is not a CRAM file.
+ #
+ refs_t *cram_get_refs(htsFile *fd)
+
+
cdef class HTSFile(object):
cdef htsFile *htsfile # pointer to htsFile structure
cdef int64_t start_offset # BGZF offset of first record
# cython: profile=True
# adds doc-strings for sphinx
import os
+import io
from posix.unistd cimport dup
+from libc.errno cimport errno
+from cpython cimport PyBytes_FromStringAndSize
from pysam.libchtslib cimport *
from pysam.libcutils cimport encode_filename, from_string_and_size
-__all__ = ["get_verbosity", "set_verbosity"]
+from warnings import warn
+__all__ = ['get_verbosity', 'set_verbosity', 'HFile', 'HTSFile']
+
+# defines imported from samtools
+DEF SEEK_SET = 0
+DEF SEEK_CUR = 1
+DEF SEEK_END = 2
+
########################################################################
########################################################################
## Constants
########################################################################
+# maximum genomic coordinace
cdef int MAX_POS = 2 << 29
+
cdef tuple FORMAT_CATEGORIES = ('UNKNOWN', 'ALIGNMENTS', 'VARIANTS', 'INDEX', 'REGIONS')
cdef tuple FORMATS = ('UNKNOWN', 'BINARY_FORMAT', 'TEXT_FORMAT', 'SAM', 'BAM', 'BAI', 'CRAM', 'CRAI',
'VCF', 'BCF', 'CSI', 'GZI', 'TBI', 'BED')
return hts_get_verbosity()
+cdef class HFile(object):
+ cdef hFILE *fp
+ cdef readonly object name, mode
+
+ def __init__(self, name, mode='r', closedf=True):
+ self._open(name, mode, closefd=True)
+
+ def __dealloc__(self):
+ self.close()
+
+ @property
+ def closed(self):
+ return self.fp == NULL
+
+ cdef _open(self, name, mode, closefd=True):
+ self.name = name
+ self.mode = mode
+
+ mode = force_bytes(mode)
+
+ if isinstance(name, int):
+ if self.fp != NULL:
+ name = dup(name)
+ self.fp = hdopen(name, mode)
+ else:
+ name = encode_filename(name)
+ self.fp = hopen(name, mode)
+
+ if not self.fp:
+ raise OSError(errno, 'failed to open HFile', self.name)
+
+ def close(self):
+ if self.fp == NULL:
+ return
+
+ cdef hFILE *fp = self.fp
+ self.fp = NULL
+
+ if hclose(fp) != 0:
+ raise OSError(herrno(self.fp), 'failed to close HFile', self.name)
+
+ def fileno(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+ if isinstance(self.name, int):
+ return self.name
+ else:
+ raise AttributeError('fileno not available')
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, tb):
+ self.close()
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ line = self.readline()
+ if not line:
+ raise StopIteration()
+ return line
+
+ def flush(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+ if hflush(self.fp) != 0:
+ raise OSError(herrno(self.fp), 'failed to flush HFile', self.name)
+
+ def isatty(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+ return False
+
+ def readable(self):
+ return self.fp != NULL and 'r' in self.mode
+
+ def read(self, Py_ssize_t size=-1):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ if size == 0:
+ return b''
+
+ cdef list parts = []
+ cdef bytes part
+ cdef Py_ssize_t chunk_size, ret, bytes_read = 0
+ cdef char *cpart
+
+ while size == -1 or bytes_read < size:
+ chunk_size = 4096
+ if size != -1:
+ chunk_size = min(chunk_size, size - bytes_read)
+
+ part = PyBytes_FromStringAndSize(NULL, chunk_size)
+ cpart = <char *>part
+ ret = hread(self.fp, <void *>cpart, chunk_size)
+
+ if ret < 0:
+ OSError(herrno(self.fp), 'failed to read HFile', self.name)
+ elif not ret:
+ break
+
+ bytes_read += ret
+
+ if ret < chunk_size:
+ part = cpart[:ret]
+
+ parts.append(part)
+
+ return b''.join(parts)
+
+ def readall(self):
+ return self.read()
+
+ def readinto(self, buf):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ size = len(buf)
+
+ if size == 0:
+ return size
+
+ mv = memoryview(buf)
+ ret = hread(self.fp, <void *>mv, size)
+
+ if ret < 0:
+ OSError(herrno(self.fp), 'failed to read HFile', self.name)
+
+ return ret
+
+ def readline(self, Py_ssize_t size=-1):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ if size == 0:
+ return b''
+
+ cdef list parts = []
+ cdef bytes part
+ cdef Py_ssize_t chunk_size, ret, bytes_read = 0
+ cdef char *cpart
+
+ while size == -1 or bytes_read < size:
+ chunk_size = 4096
+ if size != -1:
+ chunk_size = min(chunk_size, size - bytes_read)
+
+ part = PyBytes_FromStringAndSize(NULL, chunk_size)
+ cpart = <char *>part
+
+ # Python bytes objects allocate an extra byte for a null terminator
+ ret = hgetln(cpart, chunk_size+1, self.fp)
+
+ if ret < 0:
+ OSError(herrno(self.fp), 'failed to read HFile', self.name)
+ elif not ret:
+ break
+
+ bytes_read += ret
+
+ if ret < chunk_size:
+ part = cpart[:ret]
+ cpart = <char *>part
+
+ parts.append(part)
+
+ if cpart[ret-1] == b'\n':
+ break
+
+ return b''.join(parts)
+
+ def readlines(self):
+ return list(self)
+
+ def seek(self, Py_ssize_t offset, int whence=SEEK_SET):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ cdef Py_ssize_t off = hseek(self.fp, offset, whence)
+
+ if off < 0:
+ raise OSError(herrno(self.fp), 'seek failed on HFile', self.name)
+
+ return off
+
+ def tell(self):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ ret = htell(self.fp)
+
+ if ret < 0:
+ raise OSError(herrno(self.fp), 'tell failed on HFile', self.name)
+
+ return ret
+
+ def seekable(self):
+ return self.fp != NULL
+
+ def truncate(self, size=None):
+ raise NotImplementedError()
+
+ def writable(self):
+ return self.fp != NULL and 'w' in self.mode
+
+ def write(self, bytes b):
+ if self.fp == NULL:
+ raise OSError('operation on closed HFile')
+
+ got = hwrite(self.fp, <void *>b, len(b))
+
+ if got < 0:
+ raise OSError(herrno(self.fp), 'write failed on HFile', self.name)
+
+ return got
+
+ def writelines(self, lines):
+ for line in lines:
+ self.write(line)
+
+
class CallableValue(object):
def __init__(self, value):
self.value = value
self.htsfile = NULL
self.duplicate_filehandle = True
+ def close(self):
+ if self.htsfile:
+ hts_close(self.htsfile)
+ self.htsfile = NULL
+
def __dealloc__(self):
if self.htsfile:
hts_close(self.htsfile)
self.htsfile = NULL
+ def check_truncation(self, ignore_truncation=False):
+ """Check if file is truncated."""
+ if not self.htsfile:
+ return
+
+ if self.htsfile.format.compression != bgzf:
+ return
+
+ cdef BGZF *bgzfp = hts_get_bgzfp(self.htsfile)
+ if not bgzfp:
+ return
+
+ cdef int ret = bgzf_check_EOF(bgzfp)
+ if ret < 0:
+ raise OSError(errno, 'error checking for EOF marker')
+ elif ret == 0:
+ msg = 'no BGZF EOF marker; file may be truncated'.format(self.filename)
+ if ignore_truncation:
+ warn(msg)
+ else:
+ raise OSError(msg)
+
def __enter__(self):
return self
raise OSError('seek not available in streams')
cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
+ if self.htsfile.format.compression == bgzf:
with nogil:
ret = bgzf_seek(hts_get_bgzfp(self.htsfile), offset, SEEK_SET)
- else:
+ elif self.htsfile.format.compression == no_compression:
with nogil:
ret = hts_useek(self.htsfile, <int>offset, SEEK_SET)
+ else:
+ raise NotImplementedError("seek not implemented in files compressed by method {}".format(
+ self.htsfile.format.compression))
return ret
def tell(self):
raise OSError('tell not available in streams')
cdef int64_t ret
- if self.htsfile.format.compression != no_compression:
+ if self.htsfile.format.compression == bgzf:
with nogil:
ret = bgzf_tell(hts_get_bgzfp(self.htsfile))
- else:
+ elif self.htsfile.format.compression == no_compression:
with nogil:
ret = hts_utell(self.htsfile)
+ elif self.htsfile.format.format == cram:
+ with nogil:
+ ret = htell(cram_fd_get_fp(self.htsfile.fp.cram))
+ else:
+ raise NotImplementedError("seek not implemented in files compressed by method {}".format(
+ self.htsfile.format.compression))
+
return ret
cdef htsFile *_open_htsfile(self) except? NULL:
fd = self.filename
else:
fd = self.filename.fileno()
-
+
if self.duplicate_filehandle:
dup_fd = dup(fd)
else:
--- /dev/null
+cdef extern from "csamtools_util.h":
+
+ int samtools_main(int argc, char *argv[])
--- /dev/null
+def py_samtools():
+ pass
pass
+cdef class asGFF3(Parser):
+ pass
+
+
cdef class asBed(Parser):
pass
# class TabixFile class wrapping tabix indexed files in bgzf format
#
# class asTuple Parser class for tuples
-# class asGT Parser class for GTF formatted rows
+# class asGTF Parser class for GTF formatted rows
+# class asGFF3 Parser class for GFF3 formatted rows
# class asBed Parser class for Bed formatted rows
# class asVCF Parser class for VCF formatted rows
#
return r
+cdef class asGFF3(Parser):
+ '''converts a :term:`tabix row` into a GFF record with the following
+ fields:
+
+ +----------+----------+-------------------------------+
+ |*Column* |*Name* |*Content* |
+ +----------+----------+-------------------------------+
+ |1 |contig |the chromosome name |
+ +----------+----------+-------------------------------+
+ |2 |feature |The feature type |
+ +----------+----------+-------------------------------+
+ |3 |source |The feature source |
+ +----------+----------+-------------------------------+
+ |4 |start |genomic start coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |5 |end |genomic end coordinate |
+ | | |(0-based) |
+ +----------+----------+-------------------------------+
+ |6 |score |feature score |
+ +----------+----------+-------------------------------+
+ |7 |strand |strand |
+ +----------+----------+-------------------------------+
+ |8 |frame |frame |
+ +----------+----------+-------------------------------+
+ |9 |attributes|the attribute field |
+ +----------+----------+-------------------------------+
+
+ '''
+ cdef parse(self, char * buffer, int len):
+ cdef ctabixproxies.GFF3Proxy r
+ r = ctabixproxies.GFF3Proxy(self.encoding)
+ r.copy(buffer, len)
+ return r
+
+
cdef class asGTF(Parser):
'''converts a :term:`tabix row` into a GTF record with the following
fields:
r = ctabixproxies.GTFProxy(self.encoding)
r.copy(buffer, len)
return r
-
+
cdef class asBed(Parser):
'''converts a :term:`tabix row` into a bed record
"Tabixfile",
"asTuple",
"asGTF",
+ "asGFF3",
"asVCF",
"asBed",
"GZIterator",
cdef copy(self, char * buffer, size_t nbytes, bint reset=*)
cdef update(self, char * buffer, size_t nbytes)
-cdef class GTFProxy(TupleProxy) :
- cdef:
- char * _attributes
- cdef bint hasOwnAttributes
+cdef class NamedTupleProxy(TupleProxy):
+ pass
+
+cdef class GTFProxy(NamedTupleProxy):
+ cdef object attribute_dict
cpdef int getMaxFields(self)
cpdef int getMinFields(self)
- cdef char * getAttributes(self)
-cdef class NamedTupleProxy(TupleProxy):
+
+cdef class GFF3Proxy(GTFProxy):
pass
+
cdef class BedProxy(NamedTupleProxy):
cdef:
import collections
+
cdef char *StrOrEmpty(char * buffer):
if buffer == NULL:
return ""
else: return buffer
+
cdef int isNew(char * p, char * buffer, size_t nbytes):
"""return True if `p` is located within `buffer` of size
`nbytes`
"""
if p == NULL:
return 0
- return not (buffer <= p < buffer + nbytes)
+
+ return not (buffer <= p <= buffer + nbytes)
cdef class TupleProxy:
self.nfields = field
if self.nfields < self.getMinFields():
raise ValueError(
- "parsing error: fewer that %i fields in line: %s" %
+ "parsing error: fewer than %i fields in line: %s" %
(self.getMinFields(), buffer))
def _getindex(self, int index):
raise IndexError("list index out of range")
if isNew(self.fields[idx], self.data, self.nbytes):
- free(self.fields[idx] )
+ free(self.fields[idx])
self.is_modified = 1
return str(v)
-cdef class GTFProxy(TupleProxy):
+cdef class NamedTupleProxy(TupleProxy):
+
+ map_key2field = {}
+
+ def __setattr__(self, key, value):
+ '''set attribute.'''
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ if self.nfields < idx:
+ raise KeyError("field %s not set" % key)
+ TupleProxy.__setitem__(self, idx, str(value))
+
+ def __getattr__(self, key):
+ cdef int idx
+ idx, f = self.map_key2field[key]
+ if self.nfields < idx:
+ raise KeyError("field %s not set" % key)
+ if f == str:
+ return force_str(self.fields[idx],
+ self.encoding)
+ return f(self.fields[idx])
+
+
+cdef dot_or_float(v):
+ if v == "" or v == b".":
+ return None
+ else:
+ try:
+ return int(v)
+ except ValueError:
+ return float(v)
+
+
+cdef dot_or_int(v):
+ if v == "" or v == b".":
+ return None
+ else:
+ return int(v)
+
+
+cdef dot_or_str(v):
+ if v == "" or v == b".":
+ return None
+ else:
+ return force_str(v)
+
+
+cdef int from1based(v):
+ return atoi(v) - 1
+
+
+cdef str to1based(int v):
+ return str(v + 1)
+
+
+cdef class GTFProxy(NamedTupleProxy):
'''Proxy class for access to GTF fields.
This class represents a GTF entry for fast read-access.
The only exception is the attributes field when set from
a dictionary - this field will manage its own memory.
+
'''
+ separator = "; "
+ # first value is field index, the tuple contains conversion
+ # functions for getting (converting internal string representation
+ # to pythonic value) and setting (converting pythonic value to
+ # interval string representation)
+ map_key2field = {
+ 'contig' : (0, (str, str)),
+ 'source' : (1, (dot_or_str, str)),
+ 'feature': (2, (dot_or_str, str)),
+ 'start' : (3, (from1based, to1based)),
+ 'end' : (4, (int, int)),
+ 'score' : (5, (dot_or_float, toDot)),
+ 'strand' : (6, (dot_or_str, str)),
+ 'frame' : (7, (dot_or_int, toDot)),
+ 'attributes': (8, (str, str))}
+
def __cinit__(self):
# automatically calls TupleProxy.__cinit__
- self.hasOwnAttributes = False
- self._attributes = NULL
-
- def __dealloc__(self):
- # automatically calls TupleProxy.__dealloc__
- if self.hasOwnAttributes:
- free(self._attributes)
-
+ self.attribute_dict = None
+
cpdef int getMinFields(self):
'''return minimum number of fields.'''
return 9
'''return max number of fields.'''
return 9
- property contig:
- '''contig of feature.'''
- def __get__(self):
- return self._getindex(0)
- def __set__(self, value):
- self._setindex(0, value)
-
- property source:
- '''feature source.'''
- def __get__(self):
- return self._getindex(1)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(1, value)
-
- property feature:
- '''feature name.'''
- def __get__(self):
- return self._getindex(2)
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(2, value)
-
- property start:
- '''feature start (in 0-based open/closed coordinates).'''
- def __get__(self ):
- return int( self._getindex(3)) - 1
- def __set__(self, value ):
- self._setindex(3, str(value+1))
-
- property end:
- '''feature end (in 0-based open/closed coordinates).'''
- def __get__(self):
- return int(self._getindex(4))
- def __set__(self, value):
- self._setindex(4, str(value))
-
- property score:
- '''feature score.'''
- def __get__(self):
- v = self._getindex(5)
- if v == "" or v[0] == '.':
- return None
- else:
- return float(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(5, str(value))
-
- property strand:
- '''feature strand.'''
- def __get__(self):
- return self._getindex(6)
- def __set__(self, value ):
- if value is None:
- value = "."
- self._setindex(6, value)
-
- property frame:
- '''feature frame.'''
- def __get__(self):
- v = self._getindex(7)
- if v == "" or v[0] == '.':
- return v
- else:
- return int(v)
-
- def __set__(self, value):
- if value is None:
- value = "."
- self._setindex(7, str(value))
-
- property attributes:
- '''feature attributes (as a string).'''
- def __get__(self):
- if self.hasOwnAttributes:
- return force_str(self._attributes)
- else:
- return force_str(self._getindex(8))
- def __set__( self, value):
- if self.hasOwnAttributes:
- free(self._attributes)
- self._attributes = NULL
- self.hasOwnAttributes = False
- self._setindex(8, value)
-
- cdef char * getAttributes(self):
- '''return pointer to attributes.'''
- cdef char * attributes
- if self.hasOwnAttributes:
- attributes = self._attributes
- else:
- attributes = self.fields[8]
- if attributes == NULL:
- raise KeyError("no attributes defined GTF entry")
- return attributes
-
def asDict(self):
"""parse attributes - return as dict
"""
-
- # remove comments
- attributes = self.attributes
-
- # separate into fields
- # Fields might contain a ";", for example in ENSEMBL GTF file
- # for mouse, v78:
- # ...; transcript_name "TXNRD2;-001"; ....
- # The current heuristic is to split on a semicolon followed by a
- # space, see also http://mblab.wustl.edu/GTF22.html
-
- # Remove white space to prevent a last empty field.
- fields = [x.strip() for x in attributes.strip().split("; ")]
-
- result = collections.OrderedDict()
-
- for f in fields:
-
- # strip semicolon (GTF files without a space after the last semicolon)
- if f.endswith(";"):
- f = f[:-1]
-
- # split at most once in order to avoid separating
- # multi-word values
- d = [x.strip() for x in f.split(" ", 1)]
-
- n,v = d[0], d[1]
- if len(d) > 2:
- v = d[1:]
-
- if v[0] == '"' and v[-1] == '"':
- v = v[1:-1]
- else:
- ## try to convert to a value
- try:
- v = float(v)
- v = int(v)
- except ValueError:
- pass
- except TypeError:
- pass
-
- result[n] = v
-
- return result
+ return collections.OrderedDict(self.attribute_iterator())
def fromDict(self, d):
'''set attributes from a dictionary.'''
- cdef char * p
- cdef int l
-
- # clean up if this field is set twice
- if self.hasOwnAttributes:
- free(self._attributes)
-
- aa = []
- for k,v in d.items():
- if isinstance(v, str):
- aa.append( '%s "%s"' % (k,v) )
- else:
- aa.append( '%s %s' % (k,str(v)) )
-
- a = force_bytes("; ".join(aa) + ";")
- p = a
- l = len(a)
- self._attributes = <char *>calloc(l + 1, sizeof(char))
- if self._attributes == NULL:
- raise ValueError("out of memory")
- memcpy(self._attributes, p, l)
-
- self.hasOwnAttributes = True
- self.is_modified = True
+ self.attribute_dict = None
+ attribute_string = force_bytes(
+ self.attribute_dict2string(d),
+ self.encoding)
+ self._setindex(8, attribute_string)
def __str__(self):
cdef char * cpy
if self.is_modified:
return "\t".join(
(self.contig,
- self.source,
- self.feature,
- str(self.start+1),
+ toDot(self.source),
+ toDot(self.feature),
+ str(self.start + 1),
str(self.end),
toDot(self.score),
toDot(self.strand),
def keys(self):
'''return a list of attributes defined in this entry.'''
- r = self.attributes
- return [x.strip().split(" ")[0]
- # separator is ';' followed by space
- for x in r.split("; ") if x.strip() != '']
+ if not self.attribute_dict:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ return self.attribute_dict.keys()
def __getitem__(self, key):
return self.__getattr__(key)
- def __getattr__(self, item):
- """Generic lookup of attribute from GFF/GTF attributes
- Only called if there *isn't* an attribute with this name
- """
- cdef char * start
- cdef char * query
- cdef char * cpy
- cdef char * end
- cdef int l
-
- #
- # important to use the getAttributes function.
- # Using the self.attributes property to access
- # the attributes caused a hard-to-trace bug
- # in which fields in the attribute string were
- # set to 0.
- # Running through valgrind complained that
- # memory was accessed in the memory field
- # that has been released. It is not clear
- # why this happened and might be a cython bug
- # (Version 0.16). The valgrind warnings
- # disappeard after accessing the C data structures
- # directly and so did the bug.
- cdef char * attributes = self.getAttributes()
- if attributes == NULL:
- raise KeyError("key %s not found, no attributes" % item)
-
- # add space in order to make sure
- # to not pick up a field that is a prefix of another field
- r = force_bytes(item + " ")
- query = r
- start = strstr(attributes, query)
-
- if start == NULL:
- raise AttributeError("'GTFProxy' has no attribute '%s'" % item)
-
- start += strlen(query)
- # skip gaps before
- while start[0] == ' ':
- start += 1
-
- if start[0] == '"':
- start += 1
- end = start
- while end[0] != '\0' and end[0] != '"':
- end += 1
- l = end - start
- result = force_str(PyBytes_FromStringAndSize(start, l),
- self.encoding)
- return result
- else:
- return force_str(start, self.encoding)
-
def setAttribute(self, name, value):
- '''convenience method to set an attribute.'''
- r = self.asDict()
- r[name] = value
- self.fromDict(r)
-
+ '''convenience method to set an attribute.
+ '''
+ if not self.attribute_dict:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ self.attribute_dict[name] = value
+
+ def attribute_string2dict(self, s):
+ return collections.OrderedDict(
+ self.attribute_string2iterator(s))
+
def __cmp__(self, other):
return (self.contig, self.strand, self.start) < \
(other.contig, other.strand, other.start)
err_msg = "op {0} isn't implemented yet".format(op)
raise NotImplementedError(err_msg)
+ def dict2attribute_string(self, d):
+ """convert dictionary to attribute string in GTF format.
-cdef class NamedTupleProxy(TupleProxy):
+ """
+ aa = []
+ for k, v in d.items():
+ if isinstance(v, str):
+ aa.append('{} "{}"'.format(k, v))
+ else:
+ aa.append("{} {}".format(k, str(v)))
- map_key2field = {}
+ return self.separator.join(aa) + ";"
+
+ def attribute_string2iterator(self, s):
+ """convert attribute string in GTF format to records
+ and iterate over key, value pairs.
+ """
+
+ # remove comments
+ attributes = force_str(s, encoding=self.encoding)
+
+ # separate into fields
+ # Fields might contain a ";", for example in ENSEMBL GTF file
+ # for mouse, v78:
+ # ...; transcript_name "TXNRD2;-001"; ....
+ # The current heuristic is to split on a semicolon followed by a
+ # space, see also http://mblab.wustl.edu/GTF22.html
+
+ # Remove white space to prevent a last empty field.
+ fields = [x.strip() for x in attributes.strip().split("; ")]
+ for f in fields:
+
+ # strip semicolon (GTF files without a space after the last semicolon)
+ if f.endswith(";"):
+ f = f[:-1]
+
+ # split at most once in order to avoid separating
+ # multi-word values
+ d = [x.strip() for x in f.split(" ", 1)]
+
+ n, v = d[0], d[1]
+ if len(d) > 2:
+ v = d[1:]
+
+ if v[0] == '"' and v[-1] == '"':
+ v = v[1:-1]
+ else:
+ ## try to convert to a value
+ try:
+ v = float(v)
+ v = int(v)
+ except ValueError:
+ pass
+ except TypeError:
+ pass
+
+ yield n, v
+
+ def __getattr__(self, key):
+ """Generic lookup of attribute from GFF/GTF attributes
+ """
+
+ # Only called if there *isn't* an attribute with this name
+ cdef int idx
+ idx, f = self.map_key2field.get(key, (-1, None))
+ if idx >= 0:
+ # deal with known attributes (fields 0-8)
+ if idx == 8:
+ # flush attributes if requested
+ if self.is_modified and self.attribute_dict is not None:
+ s = self.dict2attribute_string(self.attribute_dict)
+ TupleProxy._setindex(self, idx, s)
+ self.attribute_dict = None
+ return s
+
+ if f[0] == str:
+ return force_str(self.fields[idx],
+ self.encoding)
+ else:
+ return f[0](self.fields[idx])
+ else:
+ # deal with generic attributes (gene_id, ...)
+ if self.attribute_dict is None:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ return self.attribute_dict[key]
def __setattr__(self, key, value):
'''set attribute.'''
- cdef int idx
- idx, f = self.map_key2field[key]
- if self.nfields < idx:
- raise KeyError("field %s not set" % key)
- TupleProxy.__setitem__(self, idx, str(value))
- def __getattr__(self, key):
+ # Note that __setattr__ is called before properties, so __setattr__ and
+ # properties don't mix well. This is different from __getattr__ which is
+ # called after any properties have been resolved.
cdef int idx
- idx, f = self.map_key2field[key]
- if self.nfields < idx:
- raise KeyError("field %s not set" % key)
- if f == str:
- return force_str(self.fields[idx],
- self.encoding)
- return f(self.fields[idx])
+ idx, f = self.map_key2field.get(key, (-1, None))
+
+ if idx >= 0:
+ if value is None:
+ s = "."
+ elif f[1] == str:
+ s = force_bytes(value,
+ self.encoding)
+ else:
+ s = str(f[1](value))
+ TupleProxy._setindex(self, idx, s)
+ else:
+ if self.attribute_dict is None:
+ self.attribute_dict = self.attribute_string2dict(
+ self.attributes)
+ self.attribute_dict[key] = value
+ self.is_modified = True
+
+
+cdef class GFF3Proxy(GTFProxy):
+
+ def dict2attribute_string(self, d):
+ """convert dictionary to attribute string."""
+ return ";".join(["{}={}".format(k, v) for k, v in d.items()])
+
+ def attribute_string2iterator(self, s):
+ """convert attribute string in GFF3 format to records
+ and iterate over key, value pairs.
+ """
+
+ for f in (x.strip() for x in s.split(";")):
+ if not f:
+ continue
+ key, value = f.split("=", 1)
+ value = value.strip()
+
+ ## try to convert to a value
+ try:
+ value = float(value)
+ value = int(value)
+ except ValueError:
+ pass
+ except TypeError:
+ pass
+
+ yield key.strip(), value
+
cdef class BedProxy(NamedTupleProxy):
'''Proxy class for access to Bed fields.
self.nfields = save_fields
return retval
- def __setattr__(self, key, value ):
+ def __setattr__(self, key, value):
'''set attribute.'''
if key == "start":
self.start = value
cdef int idx
idx, f = self.map_key2field[key]
- TupleProxy._setindex(self, idx, str(value) )
+ TupleProxy._setindex(self, idx, str(value))
+
cdef class VCFProxy(NamedTupleProxy):
'''Proxy class for access to VCF fields.
cdef extern from "pysam_util.h":
- int samtools_main(int argc, char *argv[])
- int bcftools_main(int argc, char *argv[])
void pysam_set_stderr(int fd)
void pysam_unset_stderr()
void pysam_set_stdout(int fd)
void pysam_set_stdout_fn(const char *)
void pysam_unset_stdout()
void set_optind(int)
+ extern int samtools_main(int argc, char *argv[])
+ extern int bcftools_main(int argc, char *argv[])
from libc.stdio cimport stdout as c_stdout
from posix.fcntl cimport open as c_open, O_WRONLY
+from libcbcftools cimport bcftools_main
+from libcsamtools cimport samtools_main
+
#####################################################################
# hard-coded constants
cdef int MAX_POS = 2 << 29
method,
args=None,
catch_stdout=True,
+ is_usage=False,
save_stdout=None):
'''call ``method`` in samtools/bcftools providing arguments in args.
+ By default, stdout is redirected to a temporary file using the patched
+ C sources except for a few commands that have an explicit output option
+ (typically: -o). In these commands (such as samtools view), this explicit
+ option is used. If *is_usage* is True, then these explicit output options
+ will not be used.
+
Catching of stdout can be turned off by setting *catch_stdout* to
False.
-
'''
if method == "index":
- if not os.path.exists(args[0]):
+ if args and not os.path.exists(args[0]):
raise IOError("No such file or directory: '%s'" % args[0])
if args is None:
pysam_set_stdout(stdout_h)
elif catch_stdout:
stdout_h, stdout_f = tempfile.mkstemp()
-
MAP_STDOUT_OPTIONS = {
- "samtools": {
- "view": "-o {}",
- "mpileup": "-o {}",
- "depad": "-o {}",
- "calmd": "", # uses pysam_stdout_fn
- },
+ "samtools": {
+ "view": "-o {}",
+ "mpileup": "-o {}",
+ "depad": "-o {}",
+ "calmd": "", # uses pysam_stdout_fn
+ },
"bcftools": {}
}
-
+
stdout_option = None
if collection == "bcftools":
# in bcftools, most methods accept -o, the exceptions
if not(method == "view" and "-c" in args):
stdout_option = MAP_STDOUT_OPTIONS[collection][method]
- if stdout_option is not None:
+ if stdout_option is not None and not is_usage:
os.close(stdout_h)
pysam_set_stdout_fn(force_bytes(stdout_f))
args.extend(stdout_option.format(stdout_f).split(" "))
#include <assert.h>
#include <unistd.h>
#include <stdio.h>
-#include "bam.h"
-#include "bam_endian.h"
+
+/* #include "bam.h" */
+/* #include "bam_endian.h" */
+
#include "htslib/khash.h"
#include "htslib/ksort.h"
#include "htslib/knetfile.h"
void set_optind(int);
+extern int samtools_main(int argc, char *argv[]);
+
+extern int bcftools_main(int argc, char *argv[]);
+
#endif
#include "samfile_util.h"
#include "htslib/sam.h"
-#include "kprobaln.h"
-
// taken from bam_md.c
// replace bam1_{qual,seq,cigar} with bam_get_{qual,seq,cigar}
// bam1_seqi -> bam_seqi
char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
-int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
-{
- uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, mm, q, len, clip_l, clip_q;
- double t;
- if (thres < 0) thres = 40; // set the default
- mm = q = len = clip_l = clip_q = 0;
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int z = y + j;
- int c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
- if (ref[x+j] == 0) break; // out of boundary
- if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
- ++len;
- if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
- ++mm;
- q += qual[z] > 33? 33 : qual[z];
- }
- }
- }
- if (j < l) break;
- x += l; y += l; len += l;
- } else if (op == BAM_CDEL) {
- for (j = 0; j < l; ++j)
- if (ref[x+j] == 0) break;
- if (j < l) break;
- x += l;
- } else if (op == BAM_CSOFT_CLIP) {
- for (j = 0; j < l; ++j) clip_q += qual[y+j];
- clip_l += l;
- y += l;
- } else if (op == BAM_CHARD_CLIP) {
- clip_q += 13 * l;
- clip_l += l;
- } else if (op == BAM_CINS) y += l;
- else if (op == BAM_CREF_SKIP) x += l;
- }
- for (i = 0, t = 1; i < mm; ++i)
- t *= (double)len / (i+1);
- t = q - 4.343 * log(t) + clip_q / 5.;
- if (t > thres) return -1;
- if (t < 0) t = 0;
- t = sqrt((thres - t) / thres) * thres;
-// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
- return (int)(t + .499);
-}
-
-
-int bam_prob_realn_core(bam1_t *b, const char *ref, int flag)
-{
- int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing
- // test if BQ or ZQ is present
- if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
- if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
- if (bq && redo_baq)
- {
- bam_aux_del(b, bq-1);
- bq = 0;
- }
- if (bq && zq) { // remove the ZQ tag
- bam_aux_del(b, zq-1);
- zq = 0;
- }
- if (bq || zq) {
- if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
- if (bq && apply_baq) { // then convert BQ to ZQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
- *(bq - 3) = 'Z';
- } else if (zq && !apply_baq) { // then convert ZQ to BQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] += (int)zq[i] - 64;
- *(zq - 3) = 'B';
- }
- return 0;
- }
- // find the start and end of the alignment
- x = c->pos, y = 0, yb = ye = xb = xe = -1;
- for (k = 0; k < c->n_cigar; ++k) {
- int op, l;
- op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (yb < 0) yb = y;
- if (xb < 0) xb = x;
- ye = y + l; xe = x + l;
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
- }
- // set bandwidth and the start and the end
- bw = 7;
- if (abs((xe - xb) - (ye - yb)) > bw)
- bw = abs((xe - xb) - (ye - yb)) + 3;
- conf.bw = bw;
- xb -= yb + bw/2; if (xb < 0) xb = 0;
- xe += c->l_qseq - ye + bw/2;
- if (xe - xb - c->l_qseq > bw)
- xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
- { // glocal
- uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
- int *state;
- bq = calloc(c->l_qseq + 1, 1);
- memcpy(bq, qual, c->l_qseq);
- s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam_seqi(seq, i)];
- r = calloc(xe - xb, 1);
- for (i = xb; i < xe; ++i) {
- if (ref[i] == 0) { xe = i; break; }
- r[i-xb] = bam_nt16_nt4_table[seq_nt16_table[(int)ref[i]]];
- }
- state = calloc(c->l_qseq, sizeof(int));
- q = calloc(c->l_qseq, 1);
- kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
- if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i) {
- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
- else bq[i] = bq[i] < q[i]? bq[i] : q[i];
- }
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
- } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
- uint8_t *left, *rght;
- left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i)
- bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
- for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
- left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
- for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
- rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
- for (i = y; i < y + l; ++i)
- bq[i] = left[i] < rght[i]? left[i] : rght[i];
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
- free(left); free(rght);
- }
- if (apply_baq) {
- for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
- bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
- } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
- free(bq); free(s); free(r); free(q); free(state);
- }
- return 0;
-}
-
-int bam_prob_realn(bam1_t *b, const char *ref)
-{
- return bam_prob_realn_core(b, ref, 1);
-}
#include "htslib/sam.h"
-int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
-int bam_prob_realn(bam1_t *b, const char *ref);
-
#endif
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
+#include <string.h>
#if !(_POSIX_C_SOURCE >= 200809L || _XOPEN_SOURCE >= 700)
/*
def usage(self):
'''return the samtools usage information for this command'''
- retval, stderr, stdout = csamtools._samtools_dispatch(
- self.dispatch)
- return stderr
+ retval, stderr, stdout = _pysam_dispatch(
+ self.collection,
+ self.dispatch,
+ is_usage=True,
+ catch_stdout=True)
+ # some tools write usage to stderr, such as mpileup
+ if stderr:
+ return stderr
+ else:
+ return stdout
# pysam versioning information
+__version__ = "0.11.2.2"
-__version__ = "0.10.0"
+# TODO: upgrade number
+__samtools_version__ = "1.4.1"
-__samtools_version__ = "1.3.1"
+# TODO: upgrade code and number
+__bcftools_version__ = "1.4.1"
-__bcftools_version__ = "1.3.1"
-
-__htslib_version__ = "1.3.2"
+__htslib_version__ = "1.4.1"
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "1.3.1"
+#define BAM_VERSION "1.4.1"
#include <stdint.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <float.h>
+#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include <htslib/kfunc.h>
#include "bam2bcf.h"
-#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
#include <stdint.h>
#include <assert.h>
#include <float.h>
+#include <htslib/hts.h>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include <htslib/kfunc.h>
#include "bam2bcf.h"
-#include "errmod.h"
extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
#define BAM2BCF_H
#include <stdint.h>
+#include <htslib/hts.h>
#include <htslib/vcf.h>
-#include "errmod.h"
/**
* A simplified version of Mann-Whitney U-test is calculated
#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "bam2bcf.h"
-#include "kprobaln.h"
#include "htslib/khash.h"
KHASH_SET_INIT_STR(rg)
bca->indelreg = 0;
for (t = 0; t < n_types; ++t) {
int l, ir;
- kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
apf1.bw = apf2.bw = abs(types[t]) + 3;
// compute indelreg
if (types[t] == 0) ir = 0;
if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
}
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
if (l > 255) l = 255;
score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
if (sc > 5) {
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499);
if (l > 255) l = 255;
score2[K*n_types + t] = sc<<8 | l;
}
free(ref2); free(query);
{ // compute indelQ
- int *sc, tmp, *sumq;
- sc = alloca(n_types * sizeof(int));
- sumq = alloca(n_types * sizeof(int));
- memset(sumq, 0, sizeof(int) * n_types);
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
for (s = K = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
//fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
}
}
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
}
free(score1); free(score2);
// free
#include <assert.h>
#include <ctype.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "bam2bcf.h"
-#include "kprobaln.h"
#include "htslib/khash.h"
KHASH_SET_INIT_STR(rg)
bca->indelreg = 0;
for (t = 0; t < n_types; ++t) {
int l, ir;
- kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ probaln_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
apf1.bw = apf2.bw = abs(types[t]) + 3;
// compute indelreg
if (types[t] == 0) ir = 0;
if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
}
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
if (l > 255) l = 255;
score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
if (sc > 5) {
- sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
- (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ sc = probaln_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
l = (int)(100. * sc / (qend - qbeg) + .499);
if (l > 255) l = 255;
score2[K*n_types + t] = sc<<8 | l;
}
free(ref2); free(query);
{ // compute indelQ
- int *sc, tmp, *sumq;
- sc = alloca(n_types * sizeof(int));
- sumq = alloca(n_types * sizeof(int));
- memset(sumq, 0, sizeof(int) * n_types);
+ int sc_a[16], sumq_a[16];
+ int tmp, *sc = sc_a, *sumq = sumq_a;
+ if (n_types > 16) {
+ sc = (int *)malloc(n_types * sizeof(int));
+ sumq = (int *)malloc(n_types * sizeof(int));
+ }
+ memset(sumq, 0, n_types * sizeof(int));
for (s = K = 0; s < n; ++s) {
for (i = 0; i < n_plp[s]; ++i, ++K) {
bam_pileup1_t *p = plp[s] + i;
//fprintf(pysam_stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d seqQ=%d indelQ=%d\n", pos, s, i, bam1_qname(p->b), (p->aux>>16)&0x3f, bca->indel_types[(p->aux>>16)&0x3f], (p->aux>>8)&0xff, p->aux&0xff);
}
}
+
+ if (sc != sc_a) free(sc);
+ if (sumq != sumq_a) free(sumq);
}
free(score1); free(score2);
// free
fprintf(stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
fprintf(stderr, " -b <bed> list of positions or regions\n");
fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
+ fprintf(stderr, " -l <int> read length threshold (ignore reads shorter than <int>) [0]\n");
fprintf(stderr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
- fprintf(stderr, " -q <int> base quality threshold\n");
- fprintf(stderr, " -Q <int> mapping quality threshold\n");
+ fprintf(stderr, " -q <int> base quality threshold [0]\n");
+ fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(stderr, " -r <chr:from-to> region\n");
- sam_global_opt_help(stderr, "-.--.");
+ sam_global_opt_help(stderr, "-.--.-");
fprintf(stderr, "\n");
fprintf(stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
int main_depth(int argc, char *argv[])
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
+ int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
else
n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
- beg = 0; end = INT_MAX; // set the default region
+ reg_tid = 0; beg = 0; end = INT_MAX; // set the default region
for (i = 0; i < n; ++i) {
int rf;
data[i] = calloc(1, sizeof(aux_t));
if (reg) {
beg = data[0]->iter->beg; // and to the parsed region coordinates
end = data[0]->iter->end;
+ reg_tid = data[0]->iter->tid;
}
// the core multi-pileup loop
while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
if (tid >= h->n_targets) continue; // diff number of @SQ lines per file?
- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
if (all) {
while (tid > last_tid) {
- if (last_tid >= 0 && all > 1 && !reg) {
- // Deal with remainder or entirety of last tid
+ if (last_tid >= 0 && !reg) {
+ // Deal with remainder or entirety of last tid.
while (++last_pos < h->target_len[last_tid]) {
+ // Horribly inefficient, but the bed API is an obfuscated black box.
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
continue;
fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1);
}
last_tid++;
last_pos = -1;
+ if (all < 2)
+ break;
}
// Deal with missing portion of current tid
last_tid = tid;
last_pos = pos;
}
+ if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue;
fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
if (all) {
// Handle terminating region
- while (last_tid < h->n_targets) {
+ if (last_tid < 0 && reg && all > 1) {
+ last_tid = reg_tid;
+ last_pos = beg-1;
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
while (++last_pos < h->target_len[last_tid]) {
if (last_pos >= end) break;
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
fprintf(pysam_stderr, " -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n");
fprintf(pysam_stderr, " -b <bed> list of positions or regions\n");
fprintf(pysam_stderr, " -f <list> list of input BAM filenames, one per line [null]\n");
- fprintf(pysam_stderr, " -l <int> read length threshold (ignore reads shorter than <int>)\n");
+ fprintf(pysam_stderr, " -l <int> read length threshold (ignore reads shorter than <int>) [0]\n");
fprintf(pysam_stderr, " -d/-m <int> maximum coverage depth [8000]\n"); // the htslib's default
- fprintf(pysam_stderr, " -q <int> base quality threshold\n");
- fprintf(pysam_stderr, " -Q <int> mapping quality threshold\n");
+ fprintf(pysam_stderr, " -q <int> base quality threshold [0]\n");
+ fprintf(pysam_stderr, " -Q <int> mapping quality threshold [0]\n");
fprintf(pysam_stderr, " -r <chr:from-to> region\n");
- sam_global_opt_help(pysam_stderr, "-.--.");
+ sam_global_opt_help(pysam_stderr, "-.--.-");
fprintf(pysam_stderr, "\n");
fprintf(pysam_stderr, "The output is a simple tab-separated table with three columns: reference name,\n");
int main_depth(int argc, char *argv[])
{
- int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
+ int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
const bam_pileup1_t **plp;
char *reg = 0; // specified region
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
else
n = argc - optind; // the number of BAMs on the command line
data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
- beg = 0; end = INT_MAX; // set the default region
+ reg_tid = 0; beg = 0; end = INT_MAX; // set the default region
for (i = 0; i < n; ++i) {
int rf;
data[i] = calloc(1, sizeof(aux_t));
if (reg) {
beg = data[0]->iter->beg; // and to the parsed region coordinates
end = data[0]->iter->end;
+ reg_tid = data[0]->iter->tid;
}
// the core multi-pileup loop
while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
if (tid >= h->n_targets) continue; // diff number of @SQ lines per file?
- if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
if (all) {
while (tid > last_tid) {
- if (last_tid >= 0 && all > 1 && !reg) {
- // Deal with remainder or entirety of last tid
+ if (last_tid >= 0 && !reg) {
+ // Deal with remainder or entirety of last tid.
while (++last_pos < h->target_len[last_tid]) {
+ // Horribly inefficient, but the bed API is an obfuscated black box.
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
continue;
fputs(h->target_name[last_tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", last_pos+1);
}
last_tid++;
last_pos = -1;
+ if (all < 2)
+ break;
}
// Deal with missing portion of current tid
last_tid = tid;
last_pos = pos;
}
+ if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue;
fputs(h->target_name[tid], pysam_stdout); fprintf(pysam_stdout, "\t%d", pos+1); // a customized fprintf(pysam_stdout, ) would be faster
for (i = 0; i < n; ++i) { // base level filters have to go here
int j, m = 0;
if (all) {
// Handle terminating region
- while (last_tid < h->n_targets) {
+ if (last_tid < 0 && reg && all > 1) {
+ last_tid = reg_tid;
+ last_pos = beg-1;
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
while (++last_pos < h->target_len[last_tid]) {
if (last_pos >= end) break;
if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
/* bam_addrprg.c -- samtools command to add or replace readgroups.
- Copyright (c) 2013, 2015 Genome Research Limited.
+ Copyright (c) 2013, 2015, 2016 Genome Research Limited.
Author: Martin O. Pollard <mp15@sanger.ac.uk>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include <string.h>
#include <stdio.h>
char* rg_line;
rg_mode mode;
sam_global_args ga;
+ htsThreadPool p;
};
struct state;
free(opts->rg_id);
free(opts->output_name);
free(opts->input_name);
+ if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
sam_global_args_free(&opts->ga);
free(opts);
}
return tmp;
}
+// Malloc a string containing [s,slim) or to the end of s if slim is NULL.
+// If lenp is non-NULL, stores the length of the resulting string there.
+static char *dup_substring(const char *s, const char *slim, size_t *lenp)
+{
+ size_t len = slim? (slim - s) : strlen(s);
+ char *ns = malloc(len+1);
+ if (ns == NULL) return NULL;
+ memcpy(ns, s, len);
+ ns[len] = '\0';
+ if (lenp) *lenp = len;
+ return ns;
+}
+
// These are to be replaced by samtools header parser
// Extracts the first @RG line from a string.
static char* get_rg_line(const char* text, size_t* last)
rg++;//skip initial \n
}
// duplicate the line for return
- char* line;
- char* end = strchr(rg, '\n');
- if (end) {
- line = strndup(rg,(end-rg));
- *last = end - rg;
- } else {
- line = strdup(rg);
- *last = strlen(rg);
- }
- return line;
+ return dup_substring(rg, strchr(rg, '\n'), last);
}
// Given a @RG line return the id
-static char* get_rg_id(const char* input)
+static char* get_rg_id(const char *line)
{
- assert(input!=NULL);
- char* line = strdup(input);
- char *next = line;
- char* token = strsep(&next, "\t");
- token = strsep(&next,"\t"); // skip first token it should always be "@RG"
- while (next != NULL) {
- char* key = strsep(&token,":");
- if (!strcmp(key,"ID")) {
- char* retval = strdup(token);
- free(line);
- return retval;
- }
- token = strsep(&next,"\t");
- }
- free(line);
- return NULL;
+ const char *id = strstr(line, "\tID:");
+ if (! id) return NULL;
+
+ id += 4;
+ return dup_substring(id, strchr(id, '\t'), NULL);
}
// Confirms the existance of an RG line with a given ID in a bam header
{
assert( hdr != NULL && rgid != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
bool found = false;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == false ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
free(line);
ptr += end;
}
- free(start);
return found;
}
static char* get_first_rgid( const bam_hdr_t *hdr )
{
assert( hdr != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
char* found = NULL;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == NULL ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
free(line);
ptr += end;
}
- free(start);
return found;
}
" -r STRING @RG line text\n"
" -R STRING ID of @RG line in existing header to use\n"
);
- sam_global_opt_help(fp, "..O..");
+ sam_global_opt_help(fp, "..O..@");
}
static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
retval->mode = overwrite_all;
sam_global_args_init(&retval->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
kstring_t rg_line = {0,0,NULL};
- while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
switch (n) {
case 'r':
// Are we adding to existing rg line?
}
retval->input_name = strdup(argv[optind+0]);
+ if (retval->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ return false;
+ }
+ }
+
*opts = retval;
return true;
}
// Open files
retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
if (retval->input_file == NULL) {
- fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name);
+ print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
return false;
}
retval->input_header = sam_hdr_read(retval->input_file);
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);
if (retval->output_file == NULL) {
- print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name);
+ print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
return false;
}
+ if (opts->p.pool) {
+ hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p);
+ hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
+ }
+
if (opts->rg_line) {
// Append new RG line to header.
// Check does not already exist
if (!readgroupise(state)) goto error;
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_SUCCESS;
error:
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_FAILURE;
}
/* bam_addrprg.c -- samtools command to add or replace readgroups.
- Copyright (c) 2013, 2015 Genome Research Limited.
+ Copyright (c) 2013, 2015, 2016 Genome Research Limited.
Author: Martin O. Pollard <mp15@sanger.ac.uk>
#include <htslib/sam.h>
#include <htslib/kstring.h>
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include <string.h>
#include <stdio.h>
char* rg_line;
rg_mode mode;
sam_global_args ga;
+ htsThreadPool p;
};
struct state;
free(opts->rg_id);
free(opts->output_name);
free(opts->input_name);
+ if (opts->p.pool) hts_tpool_destroy(opts->p.pool);
sam_global_args_free(&opts->ga);
free(opts);
}
return tmp;
}
+// Malloc a string containing [s,slim) or to the end of s if slim is NULL.
+// If lenp is non-NULL, stores the length of the resulting string there.
+static char *dup_substring(const char *s, const char *slim, size_t *lenp)
+{
+ size_t len = slim? (slim - s) : strlen(s);
+ char *ns = malloc(len+1);
+ if (ns == NULL) return NULL;
+ memcpy(ns, s, len);
+ ns[len] = '\0';
+ if (lenp) *lenp = len;
+ return ns;
+}
+
// These are to be replaced by samtools header parser
// Extracts the first @RG line from a string.
static char* get_rg_line(const char* text, size_t* last)
rg++;//skip initial \n
}
// duplicate the line for return
- char* line;
- char* end = strchr(rg, '\n');
- if (end) {
- line = strndup(rg,(end-rg));
- *last = end - rg;
- } else {
- line = strdup(rg);
- *last = strlen(rg);
- }
- return line;
+ return dup_substring(rg, strchr(rg, '\n'), last);
}
// Given a @RG line return the id
-static char* get_rg_id(const char* input)
+static char* get_rg_id(const char *line)
{
- assert(input!=NULL);
- char* line = strdup(input);
- char *next = line;
- char* token = strsep(&next, "\t");
- token = strsep(&next,"\t"); // skip first token it should always be "@RG"
- while (next != NULL) {
- char* key = strsep(&token,":");
- if (!strcmp(key,"ID")) {
- char* retval = strdup(token);
- free(line);
- return retval;
- }
- token = strsep(&next,"\t");
- }
- free(line);
- return NULL;
+ const char *id = strstr(line, "\tID:");
+ if (! id) return NULL;
+
+ id += 4;
+ return dup_substring(id, strchr(id, '\t'), NULL);
}
// Confirms the existance of an RG line with a given ID in a bam header
{
assert( hdr != NULL && rgid != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
bool found = false;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == false ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
free(line);
ptr += end;
}
- free(start);
return found;
}
static char* get_first_rgid( const bam_hdr_t *hdr )
{
assert( hdr != NULL );
- char *ptr, *start;
+ const char *ptr = hdr->text;
char* found = NULL;
- start = ptr = strndup(hdr->text, hdr->l_text);
while (ptr != NULL && *ptr != '\0' && found == NULL ) {
size_t end = 0;
char* line = get_rg_line(ptr, &end);
free(line);
ptr += end;
}
- free(start);
return found;
}
" -r STRING @RG line text\n"
" -R STRING ID of @RG line in existing header to use\n"
);
- sam_global_opt_help(fp, "..O..");
+ sam_global_opt_help(fp, "..O..@");
}
static bool parse_args(int argc, char** argv, parsed_opts_t** opts)
retval->mode = overwrite_all;
sam_global_args_init(&retval->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS(0, 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
kstring_t rg_line = {0,0,NULL};
- while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h", lopts, NULL)) >= 0) {
+ while ((n = getopt_long(argc, argv, "r:R:m:o:O:l:h@:", lopts, NULL)) >= 0) {
switch (n) {
case 'r':
// Are we adding to existing rg line?
}
retval->input_name = strdup(argv[optind+0]);
+ if (retval->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(retval->ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ return false;
+ }
+ }
+
*opts = retval;
return true;
}
// Open files
retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
if (retval->input_file == NULL) {
- fprintf(pysam_stderr, "[init] Could not open input file: %s\n", opts->input_name);
+ print_error_errno("addreplacerg", "could not open \"%s\"", opts->input_name);
return false;
}
retval->input_header = sam_hdr_read(retval->input_file);
retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);
if (retval->output_file == NULL) {
- print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name);
+ print_error_errno("addreplacerg", "could not create \"%s\"", opts->output_name);
return false;
}
+ if (opts->p.pool) {
+ hts_set_opt(retval->input_file, HTS_OPT_THREAD_POOL, &opts->p);
+ hts_set_opt(retval->output_file, HTS_OPT_THREAD_POOL, &opts->p);
+ }
+
if (opts->rg_line) {
// Append new RG line to header.
// Check does not already exist
if (!readgroupise(state)) goto error;
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_SUCCESS;
error:
- cleanup_opts(opts);
cleanup_state(state);
+ cleanup_opts(opts);
return EXIT_FAILURE;
}
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#include <strings.h>
#include "htslib/bgzf.h"
#include "htslib/sam.h"
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) != 0) goto write_fail;
}
{
bam_hdr_t *h = 0;
char *outfn = 0;
+ char **infns = NULL; // files to concatenate
+ int infns_size = 0;
int c, ret = 0;
samFile *in;
- while ((c = getopt(argc, argv, "h:o:")) >= 0) {
+ while ((c = getopt(argc, argv, "h:o:b:")) >= 0) {
switch (c) {
case 'h': {
samFile *fph = sam_open(optarg, "r");
break;
}
case 'o': outfn = strdup(optarg); break;
+ case 'b': {
+ // add file names in "optarg" to the list
+ // of files to concatenate
+ int nfns;
+ char **fns_read = hts_readlines(optarg, &nfns);
+ if (fns_read) {
+ infns = realloc(infns, (infns_size + nfns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns+infns_size, fns_read, nfns * sizeof(char*));
+ infns_size += nfns;
+ free(fns_read);
+ } else {
+ print_error("cat", "Invalid file list \"%s\"", optarg);
+ ret = 1;
+ }
+ break;
+ }
}
}
- if (argc - optind < 1) {
- fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
+
+ // Append files specified in argv to the list.
+ int nargv_fns = argc - optind;
+ if (nargv_fns > 0) {
+ infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*));
+ }
+
+ // Require at least one input file
+ if (infns_size + nargv_fns == 0) {
+ fprintf(stderr, "Usage: samtools cat [options] <in1.bam> [... <inN.bam>]\n");
+ fprintf(stderr, " samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
+ fprintf(stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
+ fprintf(stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n");
+ fprintf(stderr, " -h FILE copy the header from FILE [default is 1st input file]\n");
+ fprintf(stderr, " -o FILE output BAM/CRAM\n");
return 1;
}
- in = sam_open(argv[optind], "r");
+ in = sam_open(infns[0], "r");
if (!in) {
- print_error_errno("cat", "failed to open file '%s'", argv[optind]);
+ print_error_errno("cat", "failed to open file '%s'", infns[0]);
return 1;
}
switch (hts_get_format(in)->format) {
case bam:
sam_close(in);
- if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
case cram:
sam_close(in);
- if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
fprintf(stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
return 1;
}
+
+ end:
+ if (infns_size > 0) {
+ int i;
+ for (i=0; i<infns_size; i++)
+ free(infns[i]);
+ }
+
free(outfn);
+ free(infns);
if (h)
bam_hdr_destroy(h);
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
+#include <strings.h>
#include "htslib/bgzf.h"
#include "htslib/sam.h"
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) != 0) goto write_fail;
}
{
bam_hdr_t *h = 0;
char *outfn = 0;
+ char **infns = NULL; // files to concatenate
+ int infns_size = 0;
int c, ret = 0;
samFile *in;
- while ((c = getopt(argc, argv, "h:o:")) >= 0) {
+ while ((c = getopt(argc, argv, "h:o:b:")) >= 0) {
switch (c) {
case 'h': {
samFile *fph = sam_open(optarg, "r");
break;
}
case 'o': outfn = strdup(optarg); break;
+ case 'b': {
+ // add file names in "optarg" to the list
+ // of files to concatenate
+ int nfns;
+ char **fns_read = hts_readlines(optarg, &nfns);
+ if (fns_read) {
+ infns = realloc(infns, (infns_size + nfns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns+infns_size, fns_read, nfns * sizeof(char*));
+ infns_size += nfns;
+ free(fns_read);
+ } else {
+ print_error("cat", "Invalid file list \"%s\"", optarg);
+ ret = 1;
+ }
+ break;
+ }
}
}
- if (argc - optind < 1) {
- fprintf(pysam_stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> [...]\n");
+
+ // Append files specified in argv to the list.
+ int nargv_fns = argc - optind;
+ if (nargv_fns > 0) {
+ infns = realloc(infns, (infns_size + nargv_fns) * sizeof(char*));
+ if (infns == NULL) { ret = 1; goto end; }
+ memcpy(infns + infns_size, argv + optind, nargv_fns * sizeof(char*));
+ }
+
+ // Require at least one input file
+ if (infns_size + nargv_fns == 0) {
+ fprintf(pysam_stderr, "Usage: samtools cat [options] <in1.bam> [... <inN.bam>]\n");
+ fprintf(pysam_stderr, " samtools cat [options] <in1.cram> [... <inN.cram>]\n\n");
+ fprintf(pysam_stderr, "Concatenate BAM or CRAM files, first those in <bamlist.fofn>, then those\non the command line.\n\n");
+ fprintf(pysam_stderr, "Options: -b FILE list of input BAM/CRAM file names, one per line\n");
+ fprintf(pysam_stderr, " -h FILE copy the header from FILE [default is 1st input file]\n");
+ fprintf(pysam_stderr, " -o FILE output BAM/CRAM\n");
return 1;
}
- in = sam_open(argv[optind], "r");
+ in = sam_open(infns[0], "r");
if (!in) {
- print_error_errno("cat", "failed to open file '%s'", argv[optind]);
+ print_error_errno("cat", "failed to open file '%s'", infns[0]);
return 1;
}
switch (hts_get_format(in)->format) {
case bam:
sam_close(in);
- if (bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (bam_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
case cram:
sam_close(in);
- if (cram_cat(argc - optind, argv + optind, h, outfn? outfn : "-") < 0)
+ if (cram_cat(infns_size+nargv_fns, infns, h, outfn? outfn : "-") < 0)
ret = 1;
break;
fprintf(pysam_stderr, "[%s] ERROR: input is not BAM or CRAM\n", __func__);
return 1;
}
+
+ end:
+ if (infns_size > 0) {
+ int i;
+ for (i=0; i<infns_size; i++)
+ free(infns[i]);
+ }
+
free(outfn);
+ free(infns);
if (h)
bam_hdr_destroy(h);
"Options:\n"
" -b Generate BAI-format index for BAM files [default]\n"
" -c Generate CSI-format index for BAM files\n"
-" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n", BAM_LIDX_SHIFT);
+" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
+ int n_threads = 0;
int c, ret;
- while ((c = getopt(argc, argv, "bcm:")) >= 0)
+ while ((c = getopt(argc, argv, "bcm:@:")) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
case 'm': csi = 1; min_shift = atoi(optarg); break;
+ case '@': n_threads = atoi(optarg); break;
default:
index_usage(stderr);
return 1;
return 1;
}
- ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0);
- if (ret != 0) {
- if (ret == -2)
- print_error_errno("index", "failed to open \"%s\"", argv[optind]);
- else if (ret == -3)
- print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads);
+ switch (ret) {
+ case 0:
+ return 0;
+
+ case -2:
+ print_error_errno("index", "failed to open \"%s\"", argv[optind]);
+ break;
+
+ case -3:
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ break;
+
+ case -4:
+ if (argv[optind+1])
+ print_error("index", "failed to create or write index \"%s\"", argv[optind+1]);
else
- print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]);
- return EXIT_FAILURE;
+ print_error("index", "failed to create or write index");
+ break;
+
+ default:
+ print_error_errno("index", "failed to create index for \"%s\"", argv[optind]);
+ break;
}
- return 0;
+ return EXIT_FAILURE;
}
int bam_idxstats(int argc, char *argv[])
return 1;
}
fp = sam_open(argv[1], "r");
- if (fp == NULL) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+ if (fp == NULL) {
+ print_error_errno("idxstats", "failed to open \"%s\"", argv[1]);
+ return 1;
+ }
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(stderr, "[%s] failed to read header for '%s'.\n",
- __func__, argv[1]);
+ print_error("idxstats", "failed to read header for \"%s\"", argv[1]);
return 1;
}
idx = sam_index_load(fp, argv[1]);
- if (idx == NULL) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; }
+ if (idx == NULL) {
+ print_error("idxstats", "fail to load index for \"%s\"", argv[1]);
+ return 1;
+ }
int i;
for (i = 0; i < header->n_targets; ++i) {
"Options:\n"
" -b Generate BAI-format index for BAM files [default]\n"
" -c Generate CSI-format index for BAM files\n"
-" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n", BAM_LIDX_SHIFT);
+" -m INT Set minimum interval size for CSI indices to 2^INT [%d]\n"
+" -@ INT Sets the number of threads [none]\n", BAM_LIDX_SHIFT);
}
int bam_index(int argc, char *argv[])
{
int csi = 0;
int min_shift = BAM_LIDX_SHIFT;
+ int n_threads = 0;
int c, ret;
- while ((c = getopt(argc, argv, "bcm:")) >= 0)
+ while ((c = getopt(argc, argv, "bcm:@:")) >= 0)
switch (c) {
case 'b': csi = 0; break;
case 'c': csi = 1; break;
case 'm': csi = 1; min_shift = atoi(optarg); break;
+ case '@': n_threads = atoi(optarg); break;
default:
index_usage(pysam_stderr);
return 1;
return 1;
}
- ret = sam_index_build2(argv[optind], argv[optind+1], csi? min_shift : 0);
- if (ret != 0) {
- if (ret == -2)
- print_error_errno("index", "failed to open \"%s\"", argv[optind]);
- else if (ret == -3)
- print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ ret = sam_index_build3(argv[optind], argv[optind+1], csi? min_shift : 0, n_threads);
+ switch (ret) {
+ case 0:
+ return 0;
+
+ case -2:
+ print_error_errno("index", "failed to open \"%s\"", argv[optind]);
+ break;
+
+ case -3:
+ print_error("index", "\"%s\" is in a format that cannot be usefully indexed", argv[optind]);
+ break;
+
+ case -4:
+ if (argv[optind+1])
+ print_error("index", "failed to create or write index \"%s\"", argv[optind+1]);
else
- print_error("index", "\"%s\" is corrupted or unsorted", argv[optind]);
- return EXIT_FAILURE;
+ print_error("index", "failed to create or write index");
+ break;
+
+ default:
+ print_error_errno("index", "failed to create index for \"%s\"", argv[optind]);
+ break;
}
- return 0;
+ return EXIT_FAILURE;
}
int bam_idxstats(int argc, char *argv[])
return 1;
}
fp = sam_open(argv[1], "r");
- if (fp == NULL) { fprintf(pysam_stderr, "[%s] fail to open BAM.\n", __func__); return 1; }
+ if (fp == NULL) {
+ print_error_errno("idxstats", "failed to open \"%s\"", argv[1]);
+ return 1;
+ }
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(pysam_stderr, "[%s] failed to read header for '%s'.\n",
- __func__, argv[1]);
+ print_error("idxstats", "failed to read header for \"%s\"", argv[1]);
return 1;
}
idx = sam_index_load(fp, argv[1]);
- if (idx == NULL) { fprintf(pysam_stderr, "[%s] fail to load the index.\n", __func__); return 1; }
+ if (idx == NULL) {
+ print_error("idxstats", "fail to load index for \"%s\"", argv[1]);
+ return 1;
+ }
int i;
for (i = 0; i < header->n_targets; ++i) {
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2016 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2017 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
return false;
}
-static void sync_mq(bam1_t* src, bam1_t* dest)
+// Returns 0 on success, -1 on failure.
+static int bam_format_cigar(const bam1_t* b, kstring_t* str)
+{
+ // An empty cigar is a special case return "*" rather than ""
+ if (b->core.n_cigar == 0) {
+ return (kputc('*', str) == EOF) ? -1 : 0;
+ }
+
+ const uint32_t *cigar = bam_get_cigar(b);
+ uint32_t i;
+
+ for (i = 0; i < b->core.n_cigar; ++i) {
+ if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1;
+ if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1;
+ }
+
+ return 0;
+}
+
+// Returns 0 on success, -1 on failure.
+static int sync_mq_mc(bam1_t* src, bam1_t* dest)
{
if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
+ // Copy Mate Mapping Quality
uint32_t mq = src->core.qual;
uint8_t* data;
if ((data = bam_aux_get(dest,"MQ")) != NULL) {
bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
}
+ // Copy mate cigar if either read is mapped
+ if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) {
+ uint8_t* data_mc;
+ if ((data_mc = bam_aux_get(dest,"MC")) != NULL) {
+ bam_aux_del(dest, data_mc);
+ }
+
+ // Convert cigar to string
+ kstring_t mc = { 0, 0, NULL };
+ if (bam_format_cigar(src, &mc) < 0) return -1;
+
+ bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc));
+ free(mc.s);
+ }
+ return 0;
}
-// copy flags
-static void sync_mate(bam1_t* a, bam1_t* b)
+// Copy flags.
+// Returns 0 on success, -1 on failure.
+static int sync_mate(bam1_t* a, bam1_t* b)
{
sync_unmapped_pos_inner(a,b);
sync_unmapped_pos_inner(b,a);
sync_mate_inner(a,b);
sync_mate_inner(b,a);
- sync_mq(a,b);
- sync_mq(b,a);
+ if (sync_mq_mc(a,b) < 0) return -1;
+ if (sync_mq_mc(b,a) < 0) return -1;
+ return 0;
}
// currently, this function ONLY works if each read has one hit
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
pre->core.flag |= BAM_FPAIRED;
cur->core.flag |= BAM_FPAIRED;
- sync_mate(pre, cur);
+ if (sync_mate(pre, cur)) goto fail;
if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
&& !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
" -p Disable FR proper pair check\n"
" -c Add template cigar ct tag\n");
- sam_global_opt_help(where, "-.O..");
+ sam_global_opt_help(where, "-.O..@");
fprintf(where,
"\n"
int bam_mating(int argc, char *argv[])
{
+ htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
// parse args
if (argc == 1) { usage(stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
+
// run
res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
res = 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return res;
fail:
if (in) sam_close(in);
if (out) sam_close(out);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return 1;
}
/* bam_mate.c -- fix mate pairing information and clean up flags.
- Copyright (C) 2009, 2011-2016 Genome Research Ltd.
+ Copyright (C) 2009, 2011-2017 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Portions copyright (C) 2012 Peter Cock, The James Hutton Institute.
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
return false;
}
-static void sync_mq(bam1_t* src, bam1_t* dest)
+// Returns 0 on success, -1 on failure.
+static int bam_format_cigar(const bam1_t* b, kstring_t* str)
+{
+ // An empty cigar is a special case return "*" rather than ""
+ if (b->core.n_cigar == 0) {
+ return (kputc('*', str) == EOF) ? -1 : 0;
+ }
+
+ const uint32_t *cigar = bam_get_cigar(b);
+ uint32_t i;
+
+ for (i = 0; i < b->core.n_cigar; ++i) {
+ if (kputw(bam_cigar_oplen(cigar[i]), str) == EOF) return -1;
+ if (kputc(bam_cigar_opchr(cigar[i]), str) == EOF) return -1;
+ }
+
+ return 0;
+}
+
+// Returns 0 on success, -1 on failure.
+static int sync_mq_mc(bam1_t* src, bam1_t* dest)
{
if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
+ // Copy Mate Mapping Quality
uint32_t mq = src->core.qual;
uint8_t* data;
if ((data = bam_aux_get(dest,"MQ")) != NULL) {
bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
}
+ // Copy mate cigar if either read is mapped
+ if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) {
+ uint8_t* data_mc;
+ if ((data_mc = bam_aux_get(dest,"MC")) != NULL) {
+ bam_aux_del(dest, data_mc);
+ }
+
+ // Convert cigar to string
+ kstring_t mc = { 0, 0, NULL };
+ if (bam_format_cigar(src, &mc) < 0) return -1;
+
+ bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc));
+ free(mc.s);
+ }
+ return 0;
}
-// copy flags
-static void sync_mate(bam1_t* a, bam1_t* b)
+// Copy flags.
+// Returns 0 on success, -1 on failure.
+static int sync_mate(bam1_t* a, bam1_t* b)
{
sync_unmapped_pos_inner(a,b);
sync_unmapped_pos_inner(b,a);
sync_mate_inner(a,b);
sync_mate_inner(b,a);
- sync_mq(a,b);
- sync_mq(b,a);
+ if (sync_mq_mc(a,b) < 0) return -1;
+ if (sync_mq_mc(b,a) < 0) return -1;
+ return 0;
}
// currently, this function ONLY works if each read has one hit
if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
pre->core.flag |= BAM_FPAIRED;
cur->core.flag |= BAM_FPAIRED;
- sync_mate(pre, cur);
+ if (sync_mate(pre, cur)) goto fail;
if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
&& !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
" -p Disable FR proper pair check\n"
" -c Add template cigar ct tag\n");
- sam_global_opt_help(where, "-.O..");
+ sam_global_opt_help(where, "-.O..@");
fprintf(where,
"\n"
int bam_mating(int argc, char *argv[])
{
+ htsThreadPool p = {NULL, 0};
samFile *in = NULL, *out = NULL;
int c, remove_reads = 0, proper_pair_check = 1, add_ct = 0, res = 1;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
char wmode[3] = {'w', 'b', 0};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
// parse args
if (argc == 1) { usage(pysam_stdout); return 0; }
- while ((c = getopt_long(argc, argv, "rpcO:", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "rpcO:@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': remove_reads = 1; break;
case 'p': proper_pair_check = 0; break;
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
+
// run
res = bam_mating_core(in, out, remove_reads, proper_pair_check, add_ct);
res = 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return res;
fail:
if (in) sam_close(in);
if (out) sam_close(out);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(&ga);
return 1;
}
#include <config.h>
-#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
-#include <math.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "kprobaln.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "samtools.h"
bam_fillmd1_core(b, ref, INT_MAX, flag, 0);
}
-int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres)
-{
- uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, mm, q, len, clip_l, clip_q;
- double t;
- if (thres < 0) thres = 40; // set the default
- mm = q = len = clip_l = clip_q = 0;
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
- if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
- ++len;
- if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
- ++mm;
- q += qual[z] > 33? 33 : qual[z];
- }
- }
- }
- if (j < l) break;
- x += l; y += l; len += l;
- } else if (op == BAM_CDEL) {
- for (j = 0; j < l; ++j)
- if (x+j >= ref_len || ref[x+j] == '\0') break;
- if (j < l) break;
- x += l;
- } else if (op == BAM_CSOFT_CLIP) {
- for (j = 0; j < l; ++j) clip_q += qual[y+j];
- clip_l += l;
- y += l;
- } else if (op == BAM_CHARD_CLIP) {
- clip_q += 13 * l;
- clip_l += l;
- } else if (op == BAM_CINS) y += l;
- else if (op == BAM_CREF_SKIP) x += l;
- }
- for (i = 0, t = 1; i < mm; ++i)
- t *= (double)len / (i+1);
- t = q - 4.343 * log(t) + clip_q / 5.;
- if (t > thres) return -1;
- if (t < 0) t = 0;
- t = sqrt((thres - t) / thres) * thres;
-// fprintf(stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
- return (int)(t + .499);
-}
-
-int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag)
-{
- int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
- return -1; // do nothing
-
- // test if BQ or ZQ is present
- if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
- if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
- if (bq && redo_baq)
- {
- bam_aux_del(b, bq-1);
- bq = 0;
- }
- if (bq && zq) { // remove the ZQ tag
- bam_aux_del(b, zq-1);
- zq = 0;
- }
- if (bq || zq) {
- if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
- if (bq && apply_baq) { // then convert BQ to ZQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
- *(bq - 3) = 'Z';
- } else if (zq && !apply_baq) { // then convert ZQ to BQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] += (int)zq[i] - 64;
- *(zq - 3) = 'B';
- }
- return 0;
- }
- // find the start and end of the alignment
- x = c->pos, y = 0, yb = ye = xb = xe = -1;
- for (k = 0; k < c->n_cigar; ++k) {
- int op, l;
- op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (yb < 0) yb = y;
- if (xb < 0) xb = x;
- ye = y + l; xe = x + l;
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
- }
- // set bandwidth and the start and the end
- bw = 7;
- if (abs((xe - xb) - (ye - yb)) > bw)
- bw = abs((xe - xb) - (ye - yb)) + 3;
- conf.bw = bw;
- xb -= yb + bw/2; if (xb < 0) xb = 0;
- xe += c->l_qseq - ye + bw/2;
- if (xe - xb - c->l_qseq > bw)
- xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
- { // glocal
- uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
- int *state;
- bq = calloc(c->l_qseq + 1, 1);
- memcpy(bq, qual, c->l_qseq);
- s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
- r = calloc(xe - xb, 1);
- for (i = xb; i < xe; ++i) {
- if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
- r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
- }
- state = calloc(c->l_qseq, sizeof(int));
- q = calloc(c->l_qseq, 1);
- kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
- if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i) {
- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
- else bq[i] = bq[i] < q[i]? bq[i] : q[i];
- }
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
- } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
- uint8_t *left, *rght;
- left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i)
- bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
- for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
- left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
- for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
- rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
- for (i = y; i < y + l; ++i)
- bq[i] = left[i] < rght[i]? left[i] : rght[i];
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
- free(left); free(rght);
- }
- if (apply_baq) {
- for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
- bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
- } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
- free(bq); free(s); free(r); free(q); free(state);
- }
- return 0;
-}
-
-int bam_prob_realn(bam1_t *b, const char *ref)
-{
- return bam_prob_realn_core(b, ref, INT_MAX, 1);
-}
-
int calmd_usage() {
fprintf(stderr,
"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
" -E extended BAQ for better sensitivity but lower specificity\n");
- sam_global_opt_help(stderr, "-....");
+ sam_global_opt_help(stderr, "-....@");
return 1;
}
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+ htsThreadPool p = {NULL, 0};
samFile *fp = NULL, *fpout = NULL;
bam_hdr_t *header = NULL;
faidx_t *fai = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
{ NULL, 0, NULL, 0 }
};
flt_flag = UPDATE_NM | UPDATE_MD;
is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
strcpy(mode_w, "w");
- while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p);
+ }
+
ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
fai = fai_load(ref_file);
if (is_realn || capQ > 10) goto fail; // Would otherwise crash
}
}
- if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
+ if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
if (capQ > 10) {
- int q = bam_cap_mapQ(b, ref, len, capQ);
+ int q = sam_cap_mapq(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
fprintf(stderr, "[bam_fillmd] error when closing output file\n");
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 0;
fail:
if (fai) fai_destroy(fai);
if (fp) sam_close(fp);
if (fpout) sam_close(fpout);
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 1;
}
#include <config.h>
-#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>
-#include <math.h>
#include "htslib/faidx.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "kprobaln.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "samtools.h"
bam_fillmd1_core(b, ref, INT_MAX, flag, 0);
}
-int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres)
-{
- uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b);
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- int i, x, y, mm, q, len, clip_l, clip_q;
- double t;
- if (thres < 0) thres = 40; // set the default
- mm = q = len = clip_l = clip_q = 0;
- for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
- int j, l = cigar[i]>>4, op = cigar[i]&0xf;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (j = 0; j < l; ++j) {
- int c1, c2, z = y + j;
- if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
- c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
- if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
- ++len;
- if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
- ++mm;
- q += qual[z] > 33? 33 : qual[z];
- }
- }
- }
- if (j < l) break;
- x += l; y += l; len += l;
- } else if (op == BAM_CDEL) {
- for (j = 0; j < l; ++j)
- if (x+j >= ref_len || ref[x+j] == '\0') break;
- if (j < l) break;
- x += l;
- } else if (op == BAM_CSOFT_CLIP) {
- for (j = 0; j < l; ++j) clip_q += qual[y+j];
- clip_l += l;
- y += l;
- } else if (op == BAM_CHARD_CLIP) {
- clip_q += 13 * l;
- clip_l += l;
- } else if (op == BAM_CINS) y += l;
- else if (op == BAM_CREF_SKIP) x += l;
- }
- for (i = 0, t = 1; i < mm; ++i)
- t *= (double)len / (i+1);
- t = q - 4.343 * log(t) + clip_q / 5.;
- if (t > thres) return -1;
- if (t < 0) t = 0;
- t = sqrt((thres - t) / thres) * thres;
-// fprintf(pysam_stderr, "%s %lf %d\n", bam_get_qname(b), t, q);
- return (int)(t + .499);
-}
-
-int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag)
-{
- int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4;
- uint32_t *cigar = bam_get_cigar(b);
- bam1_core_t *c = &b->core;
- kpa_par_t conf = kpa_par_def;
- uint8_t *bq = 0, *zq = 0, *qual = bam_get_qual(b);
- if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1)
- return -1; // do nothing
-
- // test if BQ or ZQ is present
- if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
- if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
- if (bq && redo_baq)
- {
- bam_aux_del(b, bq-1);
- bq = 0;
- }
- if (bq && zq) { // remove the ZQ tag
- bam_aux_del(b, zq-1);
- zq = 0;
- }
- if (bq || zq) {
- if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
- if (bq && apply_baq) { // then convert BQ to ZQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
- *(bq - 3) = 'Z';
- } else if (zq && !apply_baq) { // then convert ZQ to BQ
- for (i = 0; i < c->l_qseq; ++i)
- qual[i] += (int)zq[i] - 64;
- *(zq - 3) = 'B';
- }
- return 0;
- }
- // find the start and end of the alignment
- x = c->pos, y = 0, yb = ye = xb = xe = -1;
- for (k = 0; k < c->n_cigar; ++k) {
- int op, l;
- op = cigar[k]&0xf; l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- if (yb < 0) yb = y;
- if (xb < 0) xb = x;
- ye = y + l; xe = x + l;
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
- }
- // set bandwidth and the start and the end
- bw = 7;
- if (abs((xe - xb) - (ye - yb)) > bw)
- bw = abs((xe - xb) - (ye - yb)) + 3;
- conf.bw = bw;
- xb -= yb + bw/2; if (xb < 0) xb = 0;
- xe += c->l_qseq - ye + bw/2;
- if (xe - xb - c->l_qseq > bw)
- xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
- { // glocal
- uint8_t *s, *r, *q, *seq = bam_get_seq(b), *bq;
- int *state;
- bq = calloc(c->l_qseq + 1, 1);
- memcpy(bq, qual, c->l_qseq);
- s = calloc(c->l_qseq, 1);
- for (i = 0; i < c->l_qseq; ++i) s[i] = seq_nt16_int[bam_seqi(seq, i)];
- r = calloc(xe - xb, 1);
- for (i = xb; i < xe; ++i) {
- if (i >= ref_len || ref[i] == '\0') { xe = i; break; }
- r[i-xb] = seq_nt16_int[seq_nt16_table[(int)ref[i]]];
- }
- state = calloc(c->l_qseq, sizeof(int));
- q = calloc(c->l_qseq, 1);
- kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
- if (!extend_baq) { // in this block, bq[] is capped by base quality qual[]
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i) {
- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
- else bq[i] = bq[i] < q[i]? bq[i] : q[i];
- }
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
- } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!)
- uint8_t *left, *rght;
- left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1);
- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
- int op = cigar[k]&0xf, l = cigar[k]>>4;
- if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
- for (i = y; i < y + l; ++i)
- bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i];
- for (left[y] = bq[y], i = y + 1; i < y + l; ++i)
- left[i] = bq[i] > left[i-1]? bq[i] : left[i-1];
- for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i)
- rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1];
- for (i = y; i < y + l; ++i)
- bq[i] = left[i] < rght[i]? left[i] : rght[i];
- x += l; y += l;
- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
- else if (op == BAM_CDEL) x += l;
- }
- for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 0 : qual[i] - bq[i]); // finalize BQ
- free(left); free(rght);
- }
- if (apply_baq) {
- for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
- bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
- } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
- free(bq); free(s); free(r); free(q); free(state);
- }
- return 0;
-}
-
-int bam_prob_realn(bam1_t *b, const char *ref)
-{
- return bam_prob_realn_core(b, ref, INT_MAX, 1);
-}
-
int calmd_usage() {
fprintf(pysam_stderr,
"Usage: samtools calmd [-eubrAES] <aln.bam> <ref.fasta>\n"
" -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"
" -E extended BAQ for better sensitivity but lower specificity\n");
- sam_global_opt_help(pysam_stderr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....@");
return 1;
}
int bam_fillmd(int argc, char *argv[])
{
int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
+ htsThreadPool p = {NULL, 0};
samFile *fp = NULL, *fpout = NULL;
bam_hdr_t *header = NULL;
faidx_t *fai = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0,'@'),
{ NULL, 0, NULL, 0 }
};
flt_flag = UPDATE_NM | UPDATE_MD;
is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
strcpy(mode_w, "w");
- while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad@:", lopts, NULL)) >= 0) {
switch (c) {
case 'r': is_realn = 1; break;
case 'e': flt_flag |= USE_EQUAL; break;
fprintf(pysam_stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
goto fail;
}
-
+
fpout = sam_open_format(pysam_stdout_fn, mode_w, &ga.out);
if (fpout == NULL) {
print_error_errno("calmd", "Failed to open output");
goto fail;
}
+ if (ga.nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ goto fail;
+ }
+ hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
+ hts_set_opt(fpout, HTS_OPT_THREAD_POOL, &p);
+ }
+
ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
fai = fai_load(ref_file);
if (is_realn || capQ > 10) goto fail; // Would otherwise crash
}
}
- if (is_realn) bam_prob_realn_core(b, ref, len, baq_flag);
+ if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
if (capQ > 10) {
- int q = bam_cap_mapQ(b, ref, len, capQ);
+ int q = sam_cap_mapq(b, ref, len, capQ);
if (b->core.qual > q) b->core.qual = q;
}
if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
fprintf(pysam_stderr, "[bam_fillmd] error when closing output file\n");
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 0;
fail:
if (fai) fai_destroy(fai);
if (fp) sam_close(fp);
if (fpout) sam_close(fpout);
+ if (p.pool) hts_tpool_destroy(p.pool);
+
return 1;
}
#include <unistd.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <limits.h>
#include <errno.h>
#include <sys/stat.h>
int bed_overlap(const void *_h, const char *chr, int beg, int end);
typedef struct {
- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all;
int rflag_require, rflag_filter;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
return 1;
}
+static void
+print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
+ int pos, int n, const char *ref, int ref_len)
+{
+ int i;
+ fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
+ for (i = 0; i < n; ++i) {
+ fputs("\t0\t*\t*", fp);
+ if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp);
+ if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp);
+ }
+ putc('\n', fp);
+}
+
static int mplp_func(void *data, bam1_t *b)
{
- extern int bam_realn(bam1_t *b, const char *ref);
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
- extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres);
char *ref;
mplp_aux_t *ma = (mplp_aux_t*)data;
int ret, skip = 0, ref_len;
}
if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
- if (ma->conf->bed) { // test overlap
+ if (ma->conf->bed && ma->conf->all == 0) { // test overlap
skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b));
if (skip) continue;
}
}
skip = 0;
- if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
if (has_ref && ma->conf->capQ_thres > 10) {
- int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres);
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) skip = 1;
else if (b->core.qual > q) b->core.qual = q;
}
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
mplp_ref_t mp_ref = MPLP_REF_INIT;
bam_mplp_t iter;
fprintf(stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
exit(EXIT_FAILURE);
}
- if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
+ if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid;
hts_idx_destroy(idx);
}
else
bam_mplp_set_maxcnt(iter, max_depth);
bcf1_t *bcf_rec = bcf_init1();
int ret;
+ int last_tid = -1, last_pos = -1;
+
// begin pileup
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
mplp_get_ref(data[0], tid, &ref, &ref_len);
//printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
if (conf->flag & MPLP_BCF) {
int total_depth, _ref0, ref16;
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
}
}
} else {
+ if (conf->all) {
+ // Deal with missing portions of previous tids
+ while (tid > last_tid) {
+ if (last_tid >= 0 && !conf->reg) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2)
+ break;
+ }
+ }
+ if (conf->all) {
+ // Deal with missing portion of current tid
+ while (++last_pos < pos) {
+ if (conf->reg && last_pos < beg0) continue; // out of range; skip
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len);
+ }
+ last_tid = tid;
+ last_pos = pos;
+ }
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
+
fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
for (i = 0; i < n; ++i) {
int j, cnt;
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
} else {
+ int n = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = p->qpos < p->b->core.l_qseq
? bam_get_qual(p->b)[p->qpos]
: 0;
if (c >= conf->min_baseQ)
- pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
+ n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
}
+ if (!n) putc('*', pileup_fp);
+
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
if (c >= conf->min_baseQ) {
c = c + 33 < 126? c + 33 : 126;
putc(c, pileup_fp);
+ n++;
}
}
+ if (!n) putc('*', pileup_fp);
+
if (conf->flag & MPLP_PRINT_MAPQ) {
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
c = plp[i][j].b->core.qual + 33;
if (c > 126) c = 126;
putc(c, pileup_fp);
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
+
if (conf->flag & MPLP_PRINT_POS) {
+ n = 0;
putc('\t', pileup_fp);
- int last = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = bam_get_qual(p->b)[p->qpos];
if ( c < conf->min_baseQ ) continue;
- if (last++) putc(',', pileup_fp);
+ if (n > 0) putc(',', pileup_fp);
fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
}
}
}
}
+ if (conf->all && !(conf->flag & MPLP_BCF)) {
+ // Handle terminating region
+ if (last_tid < 0 && conf->reg && conf->all > 1) {
+ last_tid = tid0;
+ last_pos = beg0-1;
+ mplp_get_ref(data[0], tid0, &ref, &ref_len);
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (last_pos >= end0) break;
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2 || conf->reg)
+ break;
+ }
+ }
+
// clean up
free(bc.tmp.s);
bcf_destroy1(bcf_rec);
return ret;
}
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
#define MAX_PATH_LEN 1024
int read_file_list(const char *file_list,int *n,char **argv[])
{
// check sanity of the file list
buf[len] = 0;
- if (stat(buf, &sb) != 0)
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
{
// no such file, check if it is safe to print its name
int i, safe_to_print = 1;
"Output options for mpileup format (without -g/-v):\n"
" -O, --output-BP output base positions on reads\n"
" -s, --output-MQ output mapping quality\n"
+" -a output all positions (including zero depth)\n"
+" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
"Output options for genotype likelihoods (when -g/-v is used):\n"
" -t, --output-tags LIST optional tags to output:\n"
fprintf(fp,
" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
" -P, --platforms STR comma separated list of platforms for indels [all]\n");
- sam_global_opt_help(fp, "-.--.");
+ sam_global_opt_help(fp, "-.--.-");
fprintf(fp,
"\n"
"Notes: Assuming diploid individuals.\n");
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
+ mplp.all = 0;
sam_global_args_init(&mplp.ga);
static const struct option lopts[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{"rf", required_argument, NULL, 1}, // require flag
{"ff", required_argument, NULL, 2}, // filter flag
{"incl-flags", required_argument, NULL, 1},
{"platforms", required_argument, NULL, 'P'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
}
break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); break;
+ case 'a': mplp.all++; break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
/* else fall-through */
#include <unistd.h>
#include <ctype.h>
#include <string.h>
+#include <strings.h>
#include <limits.h>
#include <errno.h>
#include <sys/stat.h>
int bed_overlap(const void *_h, const char *chr, int beg, int end);
typedef struct {
- int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+ int min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag, all;
int rflag_require, rflag_filter;
int openQ, extQ, tandemQ, min_support; // for indels
double min_frac; // for indels
return 1;
}
+static void
+print_empty_pileup(FILE *fp, const mplp_conf_t *conf, const char *tname,
+ int pos, int n, const char *ref, int ref_len)
+{
+ int i;
+ fprintf(fp, "%s\t%d\t%c", tname, pos+1, (ref && pos < ref_len)? ref[pos] : 'N');
+ for (i = 0; i < n; ++i) {
+ fputs("\t0\t*\t*", fp);
+ if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", fp);
+ if (conf->flag & MPLP_PRINT_POS) fputs("\t*", fp);
+ }
+ putc('\n', fp);
+}
+
static int mplp_func(void *data, bam1_t *b)
{
- extern int bam_realn(bam1_t *b, const char *ref);
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
- extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres);
char *ref;
mplp_aux_t *ma = (mplp_aux_t*)data;
int ret, skip = 0, ref_len;
}
if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
- if (ma->conf->bed) { // test overlap
+ if (ma->conf->bed && ma->conf->all == 0) { // test overlap
skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b));
if (skip) continue;
}
}
skip = 0;
- if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
+ if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3);
if (has_ref && ma->conf->capQ_thres > 10) {
- int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres);
+ int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres);
if (q < 0) skip = 1;
else if (b->core.qual > q) b->core.qual = q;
}
extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, ref_len, max_depth, max_indel_depth;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = INT_MAX, tid0 = 0, ref_len, max_depth, max_indel_depth;
const bam_pileup1_t **plp;
mplp_ref_t mp_ref = MPLP_REF_INIT;
bam_mplp_t iter;
fprintf(pysam_stderr, "[E::%s] fail to parse region '%s' with %s\n", __func__, conf->reg, fn[i]);
exit(EXIT_FAILURE);
}
- if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
+ if (i == 0) beg0 = data[i]->iter->beg, end0 = data[i]->iter->end, tid0 = data[i]->iter->tid;
hts_idx_destroy(idx);
}
else
bam_mplp_set_maxcnt(iter, max_depth);
bcf1_t *bcf_rec = bcf_init1();
int ret;
+ int last_tid = -1, last_pos = -1;
+
// begin pileup
while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
- if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
mplp_get_ref(data[0], tid, &ref, &ref_len);
//printf("tid=%d len=%d ref=%p/%s\n", tid, ref_len, ref, ref);
if (conf->flag & MPLP_BCF) {
int total_depth, _ref0, ref16;
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
}
}
} else {
+ if (conf->all) {
+ // Deal with missing portions of previous tids
+ while (tid > last_tid) {
+ if (last_tid >= 0 && !conf->reg) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2)
+ break;
+ }
+ }
+ if (conf->all) {
+ // Deal with missing portion of current tid
+ while (++last_pos < pos) {
+ if (conf->reg && last_pos < beg0) continue; // out of range; skip
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[tid], last_pos, n, ref, ref_len);
+ }
+ last_tid = tid;
+ last_pos = pos;
+ }
+ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
+
fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
for (i = 0; i < n; ++i) {
int j, cnt;
if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
} else {
+ int n = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = p->qpos < p->b->core.l_qseq
? bam_get_qual(p->b)[p->qpos]
: 0;
if (c >= conf->min_baseQ)
- pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
+ n++, pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
}
+ if (!n) putc('*', pileup_fp);
+
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
if (c >= conf->min_baseQ) {
c = c + 33 < 126? c + 33 : 126;
putc(c, pileup_fp);
+ n++;
}
}
+ if (!n) putc('*', pileup_fp);
+
if (conf->flag & MPLP_PRINT_MAPQ) {
+ n = 0;
putc('\t', pileup_fp);
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
c = plp[i][j].b->core.qual + 33;
if (c > 126) c = 126;
putc(c, pileup_fp);
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
+
if (conf->flag & MPLP_PRINT_POS) {
+ n = 0;
putc('\t', pileup_fp);
- int last = 0;
for (j = 0; j < n_plp[i]; ++j) {
const bam_pileup1_t *p = plp[i] + j;
int c = bam_get_qual(p->b)[p->qpos];
if ( c < conf->min_baseQ ) continue;
- if (last++) putc(',', pileup_fp);
+ if (n > 0) putc(',', pileup_fp);
fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: fprintf(pysam_stdout, ) is very slow...
+ n++;
}
+ if (!n) putc('*', pileup_fp);
}
}
}
}
}
+ if (conf->all && !(conf->flag & MPLP_BCF)) {
+ // Handle terminating region
+ if (last_tid < 0 && conf->reg && conf->all > 1) {
+ last_tid = tid0;
+ last_pos = beg0-1;
+ mplp_get_ref(data[0], tid0, &ref, &ref_len);
+ }
+ while (last_tid >= 0 && last_tid < h->n_targets) {
+ while (++last_pos < h->target_len[last_tid]) {
+ if (last_pos >= end0) break;
+ if (conf->bed && bed_overlap(conf->bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
+ continue;
+ print_empty_pileup(pileup_fp, conf, h->target_name[last_tid], last_pos, n, ref, ref_len);
+ }
+ last_tid++;
+ last_pos = -1;
+ if (conf->all < 2 || conf->reg)
+ break;
+ }
+ }
+
// clean up
free(bc.tmp.s);
bcf_destroy1(bcf_rec);
return ret;
}
+static int is_url(const char *s)
+{
+ static const char uri_scheme_chars[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-";
+ return s[strspn(s, uri_scheme_chars)] == ':';
+}
+
#define MAX_PATH_LEN 1024
int read_file_list(const char *file_list,int *n,char **argv[])
{
// check sanity of the file list
buf[len] = 0;
- if (stat(buf, &sb) != 0)
+ if (! (is_url(buf) || stat(buf, &sb) == 0))
{
// no such file, check if it is safe to print its name
int i, safe_to_print = 1;
"Output options for mpileup format (without -g/-v):\n"
" -O, --output-BP output base positions on reads\n"
" -s, --output-MQ output mapping quality\n"
+" -a output all positions (including zero depth)\n"
+" -a -a (or -aa) output absolutely all positions, including unused ref. sequences\n"
"\n"
"Output options for genotype likelihoods (when -g/-v is used):\n"
" -t, --output-tags LIST optional tags to output:\n"
fprintf(fp,
" -p, --per-sample-mF apply -m and -F per-sample for increased sensitivity\n"
" -P, --platforms STR comma separated list of platforms for indels [all]\n");
- sam_global_opt_help(fp, "-.--.");
+ sam_global_opt_help(fp, "-.--.-");
fprintf(fp,
"\n"
"Notes: Assuming diploid individuals.\n");
mplp.argc = argc; mplp.argv = argv;
mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP;
mplp.output_fname = NULL;
+ mplp.all = 0;
sam_global_args_init(&mplp.ga);
static const struct option lopts[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{"rf", required_argument, NULL, 1}, // require flag
{"ff", required_argument, NULL, 2}, // filter flag
{"incl-flags", required_argument, NULL, 1},
{"platforms", required_argument, NULL, 'P'},
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:a",lopts,NULL)) >= 0) {
switch (c) {
case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break;
case 1 :
}
break;
case 't': mplp.fmt_flag |= parse_format_flag(optarg); break;
+ case 'a': mplp.all++; break;
default:
if (parse_sam_global_opt(c, optarg, lopts, &mplp.ga) == 0) break;
/* else fall-through */
#include <htslib/hts.h>
#include <htslib/sam.h>
-#include <htslib/bgzf.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// attempt to open
htsFile *hts_fp = hts_open(fn, "r");
if (hts_fp == NULL) {
- if (verbose >= 2) fprintf(stderr, "%s could not be opened for reading\n", fn);
+ if (verbose >= 2) fprintf(stderr, "%s could not be opened for reading.\n", fn);
file_state |= 2;
}
else {
// make sure we have sequence data
const htsFormat *fmt = hts_get_format(hts_fp);
if (fmt->category != sequence_data ) {
- if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data\n", fn);
+ if (verbose >= 2) fprintf(stderr, "%s was not identified as sequence data.\n", fn);
file_state |= 4;
}
else {
if (verbose >= 3) fprintf(stderr, "%s is sequence data\n", fn);
// check header
bam_hdr_t *header = sam_hdr_read(hts_fp);
- if (header->n_targets <= 0) {
- if (verbose >= 2) fprintf(stderr, "%s had no targets in header\n", fn);
+ if (header == NULL) {
+ if (verbose >= 2) fprintf(stderr, "%s caused an error whilst reading its header.\n", fn);
file_state |= 8;
- }
- else {
- if (verbose >= 3) fprintf(stderr, "%s has %d targets in header\n", fn, header->n_targets);
- }
-
- // only check EOF on BAM for now
- // TODO implement and use hts_check_EOF() to include CRAM support
- if (fmt->format == bam) {
- if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
- if (verbose >= 2) fprintf(stderr, "%s was missing EOF block\n", fn);
- file_state |= 16;
+ } else {
+ if (header->n_targets <= 0) {
+ if (verbose >= 2) fprintf(stderr, "%s had no targets in header.\n", fn);
+ file_state |= 8;
}
else {
- if (verbose >= 3) fprintf(stderr, "%s has good EOF block\n", fn);
+ if (verbose >= 3) fprintf(stderr, "%s has %d targets in header.\n", fn, header->n_targets);
}
+ bam_hdr_destroy(header);
+ }
+ }
+ // check EOF on formats that support this
+ int ret;
+ if ((ret = hts_check_EOF(hts_fp)) < 0) {
+ if (verbose >= 2) fprintf(stderr, "%s caused an error whilst checking for EOF block.\n", fn);
+ file_state |= 16;
+ }
+ else {
+ switch (ret) {
+ case 0:
+ if (verbose >= 2) fprintf(stderr, "%s was missing EOF block when one should be present.\n", fn);
+ file_state |= 16;
+ break;
+ case 1:
+ if (verbose >= 3) fprintf(stderr, "%s has good EOF block.\n", fn);
+ break;
+ case 2:
+ if (verbose >= 3) fprintf(stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn);
+ break;
+ case 3:
+ if (verbose >= 3) fprintf(stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn);
+ break;
}
}
if (hts_close(hts_fp) < 0) {
file_state |= 32;
- if (verbose >= 2) fprintf(stderr, "%s did not close cleanly\n", fn);
+ if (verbose >= 2) fprintf(stderr, "%s did not close cleanly.\n", fn);
}
}
#include <htslib/hts.h>
#include <htslib/sam.h>
-#include <htslib/bgzf.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// attempt to open
htsFile *hts_fp = hts_open(fn, "r");
if (hts_fp == NULL) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s could not be opened for reading.\n", fn);
file_state |= 2;
}
else {
// make sure we have sequence data
const htsFormat *fmt = hts_get_format(hts_fp);
if (fmt->category != sequence_data ) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s was not identified as sequence data.\n", fn);
file_state |= 4;
}
else {
if (verbose >= 3) fprintf(pysam_stderr, "%s is sequence data\n", fn);
// check header
bam_hdr_t *header = sam_hdr_read(hts_fp);
- if (header->n_targets <= 0) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header\n", fn);
+ if (header == NULL) {
+ if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst reading its header.\n", fn);
file_state |= 8;
- }
- else {
- if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header\n", fn, header->n_targets);
- }
-
- // only check EOF on BAM for now
- // TODO implement and use hts_check_EOF() to include CRAM support
- if (fmt->format == bam) {
- if (bgzf_check_EOF(hts_fp->fp.bgzf) <= 0) {
- if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block\n", fn);
- file_state |= 16;
+ } else {
+ if (header->n_targets <= 0) {
+ if (verbose >= 2) fprintf(pysam_stderr, "%s had no targets in header.\n", fn);
+ file_state |= 8;
}
else {
- if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block\n", fn);
+ if (verbose >= 3) fprintf(pysam_stderr, "%s has %d targets in header.\n", fn, header->n_targets);
}
+ bam_hdr_destroy(header);
+ }
+ }
+ // check EOF on formats that support this
+ int ret;
+ if ((ret = hts_check_EOF(hts_fp)) < 0) {
+ if (verbose >= 2) fprintf(pysam_stderr, "%s caused an error whilst checking for EOF block.\n", fn);
+ file_state |= 16;
+ }
+ else {
+ switch (ret) {
+ case 0:
+ if (verbose >= 2) fprintf(pysam_stderr, "%s was missing EOF block when one should be present.\n", fn);
+ file_state |= 16;
+ break;
+ case 1:
+ if (verbose >= 3) fprintf(pysam_stderr, "%s has good EOF block.\n", fn);
+ break;
+ case 2:
+ if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block as it is not seekable.\n", fn);
+ break;
+ case 3:
+ if (verbose >= 3) fprintf(pysam_stderr, "%s cannot be checked for EOF block because its filetype does not contain one.\n", fn);
+ break;
}
}
if (hts_close(hts_fp) < 0) {
file_state |= 32;
- if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly\n", fn);
+ if (verbose >= 2) fprintf(pysam_stderr, "%s did not close cleanly.\n", fn);
}
}
goto fail;
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) < 0) goto write_fail;
}
while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
int32_put_blk(b, header_len);
cram_block_append(b, sam_hdr_str(hdr), header_len);
// Zero the remaining block
- memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
+ memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0,
cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
// Make sure all sizes and byte-offsets are consistent after memset
cram_block_set_offset(b, cram_block_get_uncomp_size(b));
goto fail;
}
if (in->block_offset < in->block_length) {
- if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
+ if (bgzf_write(fp, (char *)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0) goto write_fail;
if (bgzf_flush(fp) < 0) goto write_fail;
}
while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
int32_put_blk(b, header_len);
cram_block_append(b, sam_hdr_str(hdr), header_len);
// Zero the remaining block
- memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
+ memset((char *)cram_block_get_data(b)+cram_block_get_offset(b), 0,
cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
// Make sure all sizes and byte-offsets are consistent after memset
cram_block_set_offset(b, cram_block_get_uncomp_size(b));
}
}
-static void usage(FILE *fp, int ret) {
+static int usage(FILE *fp, int ret) {
fprintf(fp,
"Usage: samtools reheader [-P] in.header.sam in.bam > out.bam\n"
" or samtools reheader [-P] -i in.header.sam file.bam\n"
" -P, --no-PG Do not generate an @PG header line.\n"
" -i, --in-place Modify the bam/cram file directly.\n"
" (Defaults to outputting to pysam_stdout.)\n");
- exit(ret);
+ return(ret);
}
int main_reheader(int argc, char *argv[])
switch (c) {
case 'P': add_PG = 0; break;
case 'i': inplace = 1; break;
- case 'h': usage(pysam_stdout, 0); break;
+ case 'h': return(usage(pysam_stdout, 0)); break;
default:
fprintf(pysam_stderr, "Invalid option '%c'\n", c);
- usage(pysam_stderr, 1);
+ return(usage(pysam_stderr, 1));
}
}
if (argc - optind != 2)
- usage(pysam_stderr, 1);
+ return(usage(pysam_stderr, 1));
{ // read the header
samFile *fph = sam_open(argv[optind], "r");
fprintf(stderr, "Option: -s rmdup for SE reads\n");
fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n");
- sam_global_opt_help(stderr, "-....");
+ sam_global_opt_help(stderr, "-....-");
return 1;
}
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
fprintf(pysam_stderr, "Option: -s rmdup for SE reads\n");
fprintf(pysam_stderr, " -S treat PE reads as SE in rmdup (force -s)\n");
- sam_global_opt_help(pysam_stderr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....-");
return 1;
}
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
#include "htslib/kstring.h"
#include "htslib/sam.h"
#include "sam_opts.h"
+#include "samtools.h"
+
+/* Minimum memory required in megabytes before sort will attempt to run. This
+ is to prevent accidents where failing to use the -m option correctly results
+ in the creation of a temporary file for each read in the input file.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_MIN_MEGS_PER_THREAD = 1;
+
+/* Default per-thread memory for sort. Must be >= SORT_MIN_MEGS_PER_THREAD.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_DEFAULT_MEGS_PER_THREAD = 768;
#if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L
#define NEED_MEMSET_PATTERN4
@param flag flags that control how the merge is undertaken
@param reg region to merge
@param n_threads number of threads to use (passed to htslib)
+ @param cmd command name (used in print_error() etc)
@param in_fmt format options for input files
@param out_fmt output file format and options
@discussion Padding information may NOT correctly maintained. This
*/
int bam_merge_core2(int by_qname, const char *out, const char *mode,
const char *headers, int n, char * const *fn, int flag,
- const char *reg, int n_threads,
+ const char *reg, int n_threads, const char *cmd,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
samFile *fpout, **fp = NULL;
if (headers) {
samFile* fpheaders = sam_open(headers, "r");
if (fpheaders == NULL) {
- const char *message = strerror(errno);
- fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+ print_error_errno(cmd, "cannot open \"%s\"", headers);
return -1;
}
hin = sam_hdr_read(fpheaders);
sam_close(fpheaders);
if (hin == NULL) {
- fprintf(stderr, "[bam_merge_core] couldn't read headers for '%s'\n",
- headers);
- goto mem_fail;
- }
- } else {
- hout = bam_hdr_init();
- if (!hout) {
- fprintf(stderr, "[bam_merge_core] couldn't allocate bam header\n");
+ print_error(cmd, "couldn't read headers from \"%s\"", headers);
goto mem_fail;
}
- hout->text = strdup("");
- if (!hout->text) goto mem_fail;
}
g_is_by_qname = by_qname;
bam_hdr_t *hin;
fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
- fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+ print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
- fprintf(stderr, "[bam_merge_core] failed to read header for '%s'\n",
- fn[i]);
+ print_error(cmd, "failed to read header from \"%s\"", fn[i]);
goto fail;
}
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
}
+
+ // Potential future improvement is to share headers between CRAM files for
+ // samtools sort (where all headers are identical.
+ // Eg:
+ //
+ // if (i > 1) {
+ // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram));
+ // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram));
+ // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram));
+ // }
}
// Did we get an @HD line?
bam_destroy1(h->b);
h->b = NULL;
} else {
- fprintf(stderr, "[%s] failed to read first record from %s\n",
- __func__, fn[i]);
+ print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
}
}
// Open output file and write header
if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
- fprintf(stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno));
+ print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
if (sam_hdr_write(fpout, hout) != 0) {
- fprintf(stderr, "[%s] failed to write header.\n", __func__);
+ print_error_errno(cmd, "failed to write header to \"%s\"", out);
sam_close(fpout);
return -1;
}
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
if (sam_write1(fpout, hout, b) < 0) {
- fprintf(stderr, "[%s] failed to write to output file.\n", __func__);
+ print_error_errno(cmd, "failed writing to \"%s\"", out);
sam_close(fpout);
return -1;
}
bam_destroy1(heap->b);
heap->b = NULL;
} else {
- fprintf(stderr, "[bam_merge_core] error: '%s' is truncated.\n",
- fn[heap->i]);
+ print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
goto fail;
}
ks_heapadjust(heap, 0, n, heap);
free_merged_header(merged_hdr);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
if (sam_close(fpout) < 0) {
- fprintf(stderr, "[bam_merge_core] error closing output file\n");
+ print_error(cmd, "error closing output file");
return -1;
}
return 0;
mem_fail:
- fprintf(stderr, "[bam_merge_core] Out of memory\n");
+ print_error(cmd, "Out of memory");
fail:
if (flag & MERGE_RG) {
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL);
+ return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
}
static void merge_usage(FILE *to)
" -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n"
" -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n"
" -s VALUE Override random seed\n"
-" -b FILE List of input BAM filenames, one per line [null]\n"
-" -@, --threads INT\n"
-" Number of BAM/CRAM compression threads [0]\n");
- sam_global_opt_help(to, "-.O..");
+" -b FILE List of input BAM filenames, one per line [null]\n");
+ sam_global_opt_help(to, "-.O..@");
}
int bam_merge(int argc, char *argv[])
{
- int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
+ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1;
char *fn_headers = NULL, *reg = NULL, mode[12];
long random_seed = (long)time(NULL);
char** fn = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
case 'u': flag |= MERGE_UNCOMP; level = 0; break;
case 'R': reg = strdup(optarg); break;
case 'l': level = atoi(optarg); break;
- case '@': n_threads = atoi(optarg); break;
case 'c': flag |= MERGE_COMBINE_RG; break;
case 'p': flag |= MERGE_COMBINE_PG; break;
case 's': random_seed = atol(optarg); break;
if (fn == NULL) { ret = 1; goto end; }
memcpy(fn+fn_size, fn_read, nfiles * sizeof(char*));
fn_size += nfiles;
+ free(fn_read);
}
else {
- fprintf(stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg);
+ print_error("merge", "Invalid file list \"%s\"", optarg);
ret = 1;
}
break;
}
}
if ( argc - optind < 1 ) {
- fprintf(stderr, "You must at least specify the output file.\n");
+ print_error("merge", "You must at least specify the output file");
merge_usage(stderr);
return 1;
}
memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
}
if (fn_size+nargcfiles < 1) {
- fprintf(stderr, "You must specify at least one (and usually two or more) input files.\n");
+ print_error("merge", "You must specify at least one (and usually two or more) input files");
merge_usage(stderr);
return 1;
}
sam_open_mode(mode+1, argv[optind], NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
- fn_size+nargcfiles, fn, flag, reg, n_threads,
- &ga.in, &ga.out) < 0)
+ fn_size+nargcfiles, fn, flag, reg, ga.nthreads,
+ "merge", &ga.in, &ga.out) < 0)
ret = 1;
end:
name = (char*)calloc(strlen(w->prefix) + 20, 1);
if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
- w->error = errno;
-
-// Consider using CRAM temporary files if the final output is CRAM.
-// Typically it is comparable speed while being smaller.
-// hts_opt opt[2] = {
-// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL},
-// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
-// };
-// opt[0].next = &opt[1];
-// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0)
-// w->error = errno;
+
+ uint32_t max_ncigar = 0;
+ int i;
+ for (i = 0; i < w->buf_len; i++) {
+ uint32_t nc = w->buf[i]->core.n_cigar;
+ if (max_ncigar < nc)
+ max_ncigar = nc;
+ }
+
+ if (max_ncigar > 65535) {
+ htsFormat fmt;
+ memset(&fmt, 0, sizeof(fmt));
+ if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) {
+ w->error = errno;
+ free(name);
+ return 0;
+ }
+
+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0)
+ w->error = errno;
+ } else {
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
+ w->error = errno;
+ }
free(name);
return 0;
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
if (w[i].error != 0) {
- fprintf(stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error));
+ errno = w[i].error;
+ print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
n_failed++;
}
}
buf = NULL;
fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
- const char *message = strerror(errno);
- fprintf(stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
+ print_error_errno("sort", "can't open \"%s\"", fn);
return -2;
}
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(stderr, "[bam_sort_core] failed to read header for '%s'\n", fn);
+ print_error("sort", "failed to read header from \"%s\"", fn);
goto err;
}
if (is_by_qname) change_SO(header, "queryname");
else change_SO(header, "coordinate");
+
+ // No gain to using the thread pool here as the flow of this code
+ // is such that we are *either* reading *or* sorting. Hence a shared
+ // pool makes no real difference except to reduce the thread count a little.
+ if (n_threads > 1)
+ hts_set_threads(fp, n_threads);
+
// write sub files
for (;;) {
if (k == max_k) {
}
}
if (ret != -1) {
- fprintf(stderr, "[bam_sort_core] truncated file. Aborting.\n");
+ print_error("sort", "truncated file. Aborting");
ret = -1;
goto err;
}
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
- fprintf(stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno));
+ print_error_errno("sort", "failed to create \"%s\"", fnout);
ret = -1;
goto err;
}
}
if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
- NULL, n_threads, in_fmt, out_fmt) < 0) {
+ NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -n Sort by read name\n"
" -o FILE Write final output to FILE rather than standard output\n"
-" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
-" -@, --threads INT\n"
-" Set number of sorting and compression threads [1]\n");
- sam_global_opt_help(fp, "-.O..");
+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n");
+ sam_global_opt_help(fp, "-.O..@");
+}
+
+static void complain_about_memory_setting(size_t max_mem) {
+ char *suffix = "";
+ const size_t nine_k = 9<<10;
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "K"; }
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "M"; }
+
+ fprintf(stderr,
+"[bam_sort] -m setting (%zu%s bytes) is less than the minimum required (%zuM).\n\n"
+"Trying to run with -m too small can lead to the creation of a very large number\n"
+"of temporary files. This may make sort fail due to it exceeding limits on the\n"
+"number of files it can have open at the same time.\n\n"
+"Please check your -m parameter. It should be an integer followed by one of the\n"
+"letters K (for kilobytes), M (megabytes) or G (gigabytes). You should ensure it\n"
+"is at least the minimum above, and much higher if you are sorting a large file.\n",
+ max_mem, suffix, SORT_MIN_MEGS_PER_THREAD);
}
int bam_sort(int argc, char *argv[])
{
- size_t max_mem = 768<<20; // 512MB
- int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
+ size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
break;
}
case 'T': kputs(optarg, &tmpprefix); break;
- case '@': n_threads = atoi(optarg); break;
case 'l': level = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
goto sort_end;
}
+ if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) {
+ complain_about_memory_setting(max_mem);
+ ret = EXIT_FAILURE;
+ goto sort_end;
+ }
+
strcpy(modeout, "wb");
sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
}
ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem, n_threads,
+ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out);
if (ret >= 0)
ret = EXIT_SUCCESS;
#include "htslib/kstring.h"
#include "htslib/sam.h"
#include "sam_opts.h"
+#include "samtools.h"
+
+/* Minimum memory required in megabytes before sort will attempt to run. This
+ is to prevent accidents where failing to use the -m option correctly results
+ in the creation of a temporary file for each read in the input file.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_MIN_MEGS_PER_THREAD = 1;
+
+/* Default per-thread memory for sort. Must be >= SORT_MIN_MEGS_PER_THREAD.
+ Don't forget to update the man page if you change this. */
+const size_t SORT_DEFAULT_MEGS_PER_THREAD = 768;
#if !defined(__DARWIN_C_LEVEL) || __DARWIN_C_LEVEL < 900000L
#define NEED_MEMSET_PATTERN4
@param flag flags that control how the merge is undertaken
@param reg region to merge
@param n_threads number of threads to use (passed to htslib)
+ @param cmd command name (used in print_error() etc)
@param in_fmt format options for input files
@param out_fmt output file format and options
@discussion Padding information may NOT correctly maintained. This
*/
int bam_merge_core2(int by_qname, const char *out, const char *mode,
const char *headers, int n, char * const *fn, int flag,
- const char *reg, int n_threads,
+ const char *reg, int n_threads, const char *cmd,
const htsFormat *in_fmt, const htsFormat *out_fmt)
{
samFile *fpout, **fp = NULL;
if (headers) {
samFile* fpheaders = sam_open(headers, "r");
if (fpheaders == NULL) {
- const char *message = strerror(errno);
- fprintf(pysam_stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+ print_error_errno(cmd, "cannot open \"%s\"", headers);
return -1;
}
hin = sam_hdr_read(fpheaders);
sam_close(fpheaders);
if (hin == NULL) {
- fprintf(pysam_stderr, "[bam_merge_core] couldn't read headers for '%s'\n",
- headers);
- goto mem_fail;
- }
- } else {
- hout = bam_hdr_init();
- if (!hout) {
- fprintf(pysam_stderr, "[bam_merge_core] couldn't allocate bam header\n");
+ print_error(cmd, "couldn't read headers from \"%s\"", headers);
goto mem_fail;
}
- hout->text = strdup("");
- if (!hout->text) goto mem_fail;
}
g_is_by_qname = by_qname;
bam_hdr_t *hin;
fp[i] = sam_open_format(fn[i], "r", in_fmt);
if (fp[i] == NULL) {
- fprintf(pysam_stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
+ print_error_errno(cmd, "fail to open \"%s\"", fn[i]);
goto fail;
}
hin = sam_hdr_read(fp[i]);
if (hin == NULL) {
- fprintf(pysam_stderr, "[bam_merge_core] failed to read header for '%s'\n",
- fn[i]);
+ print_error(cmd, "failed to read header from \"%s\"", fn[i]);
goto fail;
}
if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
fprintf(pysam_stderr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
}
+
+ // Potential future improvement is to share headers between CRAM files for
+ // samtools sort (where all headers are identical.
+ // Eg:
+ //
+ // if (i > 1) {
+ // sam_hdr_free(cram_fd_get_header(fp[i]->fp.cram));
+ // cram_fd_set_header(fp[i]->fp.cram, cram_fd_get_header(fp[0]->fp.cram));
+ // sam_hdr_incr_ref(cram_fd_get_header(fp[0]->fp.cram));
+ // }
}
// Did we get an @HD line?
bam_destroy1(h->b);
h->b = NULL;
} else {
- fprintf(pysam_stderr, "[%s] failed to read first record from %s\n",
- __func__, fn[i]);
+ print_error(cmd, "failed to read first record from \"%s\"", fn[i]);
goto fail;
}
}
// Open output file and write header
if ((fpout = sam_open_format(out, mode, out_fmt)) == 0) {
- fprintf(pysam_stderr, "[%s] failed to create \"%s\": %s\n", __func__, out, strerror(errno));
+ print_error_errno(cmd, "failed to create \"%s\"", out);
return -1;
}
if (sam_hdr_write(fpout, hout) != 0) {
- fprintf(pysam_stderr, "[%s] failed to write header.\n", __func__);
+ print_error_errno(cmd, "failed to write header to \"%s\"", out);
sam_close(fpout);
return -1;
}
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
}
if (sam_write1(fpout, hout, b) < 0) {
- fprintf(pysam_stderr, "[%s] failed to write to output file.\n", __func__);
+ print_error_errno(cmd, "failed writing to \"%s\"", out);
sam_close(fpout);
return -1;
}
bam_destroy1(heap->b);
heap->b = NULL;
} else {
- fprintf(pysam_stderr, "[bam_merge_core] error: '%s' is truncated.\n",
- fn[heap->i]);
+ print_error(cmd, "\"%s\" is truncated", fn[heap->i]);
goto fail;
}
ks_heapadjust(heap, 0, n, heap);
free_merged_header(merged_hdr);
free(RG); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
if (sam_close(fpout) < 0) {
- fprintf(pysam_stderr, "[bam_merge_core] error closing output file\n");
+ print_error(cmd, "error closing output file");
return -1;
}
return 0;
mem_fail:
- fprintf(pysam_stderr, "[bam_merge_core] Out of memory\n");
+ print_error(cmd, "Out of memory");
fail:
if (flag & MERGE_RG) {
strcpy(mode, "wb");
if (flag & MERGE_UNCOMP) strcat(mode, "0");
else if (flag & MERGE_LEVEL1) strcat(mode, "1");
- return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, NULL, NULL);
+ return bam_merge_core2(by_qname, out, mode, headers, n, fn, flag, reg, 0, "merge", NULL, NULL);
}
static void merge_usage(FILE *to)
" -c Combine @RG headers with colliding IDs [alter IDs to be distinct]\n"
" -p Combine @PG headers with colliding IDs [alter IDs to be distinct]\n"
" -s VALUE Override random seed\n"
-" -b FILE List of input BAM filenames, one per line [null]\n"
-" -@, --threads INT\n"
-" Number of BAM/CRAM compression threads [0]\n");
- sam_global_opt_help(to, "-.O..");
+" -b FILE List of input BAM filenames, one per line [null]\n");
+ sam_global_opt_help(to, "-.O..@");
}
int bam_merge(int argc, char *argv[])
{
- int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1;
+ int c, is_by_qname = 0, flag = 0, ret = 0, level = -1;
char *fn_headers = NULL, *reg = NULL, mode[12];
long random_seed = (long)time(NULL);
char** fn = NULL;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
case 'u': flag |= MERGE_UNCOMP; level = 0; break;
case 'R': reg = strdup(optarg); break;
case 'l': level = atoi(optarg); break;
- case '@': n_threads = atoi(optarg); break;
case 'c': flag |= MERGE_COMBINE_RG; break;
case 'p': flag |= MERGE_COMBINE_PG; break;
case 's': random_seed = atol(optarg); break;
if (fn == NULL) { ret = 1; goto end; }
memcpy(fn+fn_size, fn_read, nfiles * sizeof(char*));
fn_size += nfiles;
+ free(fn_read);
}
else {
- fprintf(pysam_stderr, "[%s] Invalid file list \"%s\"\n", __func__, optarg);
+ print_error("merge", "Invalid file list \"%s\"", optarg);
ret = 1;
}
break;
}
}
if ( argc - optind < 1 ) {
- fprintf(pysam_stderr, "You must at least specify the output file.\n");
+ print_error("merge", "You must at least specify the output file");
merge_usage(pysam_stderr);
return 1;
}
memcpy(fn+fn_size, argv + (optind+1), nargcfiles * sizeof(char*));
}
if (fn_size+nargcfiles < 1) {
- fprintf(pysam_stderr, "You must specify at least one (and usually two or more) input files.\n");
+ print_error("merge", "You must specify at least one (and usually two or more) input files");
merge_usage(pysam_stderr);
return 1;
}
sam_open_mode(mode+1, argv[optind], NULL);
if (level >= 0) sprintf(strchr(mode, '\0'), "%d", level < 9? level : 9);
if (bam_merge_core2(is_by_qname, argv[optind], mode, fn_headers,
- fn_size+nargcfiles, fn, flag, reg, n_threads,
- &ga.in, &ga.out) < 0)
+ fn_size+nargcfiles, fn, flag, reg, ga.nthreads,
+ "merge", &ga.in, &ga.out) < 0)
ret = 1;
end:
name = (char*)calloc(strlen(w->prefix) + 20, 1);
if (!name) { w->error = errno; return 0; }
sprintf(name, "%s.%.4d.bam", w->prefix, w->index);
- if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
- w->error = errno;
-
-// Consider using CRAM temporary files if the final output is CRAM.
-// Typically it is comparable speed while being smaller.
-// hts_opt opt[2] = {
-// {"version=3.0", CRAM_OPT_VERSION, {"3.0"}, NULL},
-// {"no_ref", CRAM_OPT_NO_REF, {1}, NULL}
-// };
-// opt[0].next = &opt[1];
-// if (write_buffer(name, "wc1", w->buf_len, w->buf, w->h, 0, opt) < 0)
-// w->error = errno;
+
+ uint32_t max_ncigar = 0;
+ int i;
+ for (i = 0; i < w->buf_len; i++) {
+ uint32_t nc = w->buf[i]->core.n_cigar;
+ if (max_ncigar < nc)
+ max_ncigar = nc;
+ }
+
+ if (max_ncigar > 65535) {
+ htsFormat fmt;
+ memset(&fmt, 0, sizeof(fmt));
+ if (hts_parse_format(&fmt, "cram,version=3.0,no_ref,seqs_per_slice=1000") < 0) {
+ w->error = errno;
+ free(name);
+ return 0;
+ }
+
+ if (write_buffer(name, "wcx1", w->buf_len, w->buf, w->h, 0, &fmt) < 0)
+ w->error = errno;
+ } else {
+ if (write_buffer(name, "wbx1", w->buf_len, w->buf, w->h, 0, NULL) < 0)
+ w->error = errno;
+ }
free(name);
return 0;
for (i = 0; i < n_threads; ++i) {
pthread_join(tid[i], 0);
if (w[i].error != 0) {
- fprintf(pysam_stderr, "[bam_sort_core] failed to create temporary file \"%s.%.4d.bam\": %s\n", prefix, w[i].index, strerror(w[i].error));
+ errno = w[i].error;
+ print_error_errno("sort", "failed to create temporary file \"%s.%.4d.bam\"", prefix, w[i].index);
n_failed++;
}
}
buf = NULL;
fp = sam_open_format(fn, "r", in_fmt);
if (fp == NULL) {
- const char *message = strerror(errno);
- fprintf(pysam_stderr, "[bam_sort_core] fail to open '%s': %s\n", fn, message);
+ print_error_errno("sort", "can't open \"%s\"", fn);
return -2;
}
header = sam_hdr_read(fp);
if (header == NULL) {
- fprintf(pysam_stderr, "[bam_sort_core] failed to read header for '%s'\n", fn);
+ print_error("sort", "failed to read header from \"%s\"", fn);
goto err;
}
if (is_by_qname) change_SO(header, "queryname");
else change_SO(header, "coordinate");
+
+ // No gain to using the thread pool here as the flow of this code
+ // is such that we are *either* reading *or* sorting. Hence a shared
+ // pool makes no real difference except to reduce the thread count a little.
+ if (n_threads > 1)
+ hts_set_threads(fp, n_threads);
+
// write sub files
for (;;) {
if (k == max_k) {
}
}
if (ret != -1) {
- fprintf(pysam_stderr, "[bam_sort_core] truncated file. Aborting.\n");
+ print_error("sort", "truncated file. Aborting");
ret = -1;
goto err;
}
if (n_files == 0) { // a single block
ks_mergesort(sort, k, buf, 0);
if (write_buffer(fnout, modeout, k, buf, header, n_threads, out_fmt) != 0) {
- fprintf(pysam_stderr, "[bam_sort_core] failed to create \"%s\": %s\n", fnout, strerror(errno));
+ print_error_errno("sort", "failed to create \"%s\"", fnout);
ret = -1;
goto err;
}
}
if (bam_merge_core2(is_by_qname, fnout, modeout, NULL, n_files, fns,
MERGE_COMBINE_RG|MERGE_COMBINE_PG|MERGE_FIRST_CO,
- NULL, n_threads, in_fmt, out_fmt) < 0) {
+ NULL, n_threads, "sort", in_fmt, out_fmt) < 0) {
// Propagate bam_merge_core2() failure; it has already emitted a
// message explaining the failure, so no further message is needed.
goto err;
" -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]\n"
" -n Sort by read name\n"
" -o FILE Write final output to FILE rather than standard output\n"
-" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n"
-" -@, --threads INT\n"
-" Set number of sorting and compression threads [1]\n");
- sam_global_opt_help(fp, "-.O..");
+" -T PREFIX Write temporary files to PREFIX.nnnn.bam\n");
+ sam_global_opt_help(fp, "-.O..@");
+}
+
+static void complain_about_memory_setting(size_t max_mem) {
+ char *suffix = "";
+ const size_t nine_k = 9<<10;
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "K"; }
+ if (max_mem > nine_k) { max_mem >>= 10; suffix = "M"; }
+
+ fprintf(pysam_stderr,
+"[bam_sort] -m setting (%zu%s bytes) is less than the minimum required (%zuM).\n\n"
+"Trying to run with -m too small can lead to the creation of a very large number\n"
+"of temporary files. This may make sort fail due to it exceeding limits on the\n"
+"number of files it can have open at the same time.\n\n"
+"Please check your -m parameter. It should be an integer followed by one of the\n"
+"letters K (for kilobytes), M (megabytes) or G (gigabytes). You should ensure it\n"
+"is at least the minimum above, and much higher if you are sorting a large file.\n",
+ max_mem, suffix, SORT_MIN_MEGS_PER_THREAD);
}
int bam_sort(int argc, char *argv[])
{
- size_t max_mem = 768<<20; // 512MB
- int c, nargs, is_by_qname = 0, ret, o_seen = 0, n_threads = 0, level = -1;
+ size_t max_mem = SORT_DEFAULT_MEGS_PER_THREAD << 20;
+ int c, nargs, is_by_qname = 0, ret, o_seen = 0, level = -1;
char *fnout = "-", modeout[12];
kstring_t tmpprefix = { 0, 0, NULL };
struct stat st;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 0, '@'),
{ "threads", required_argument, NULL, '@' },
{ NULL, 0, NULL, 0 }
};
break;
}
case 'T': kputs(optarg, &tmpprefix); break;
- case '@': n_threads = atoi(optarg); break;
case 'l': level = atoi(optarg); break;
default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
goto sort_end;
}
+ if (max_mem < (SORT_MIN_MEGS_PER_THREAD << 20)) {
+ complain_about_memory_setting(max_mem);
+ ret = EXIT_FAILURE;
+ goto sort_end;
+ }
+
strcpy(modeout, "wb");
sam_open_mode(modeout+1, fnout, NULL);
if (level >= 0) sprintf(strchr(modeout, '\0'), "%d", level < 9? level : 9);
}
ret = bam_sort_core_ext(is_by_qname, (nargs > 0)? argv[optind] : "-",
- tmpprefix.s, fnout, modeout, max_mem, n_threads,
+ tmpprefix.s, fnout, modeout, max_mem, ga.nthreads,
&ga.in, &ga.out);
if (ret >= 0)
ret = EXIT_SUCCESS;
/* bam_split.c -- split subcommand.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Martin Pollard <mp15@sanger.ac.uk>
#include <regex.h>
#include <htslib/khash.h>
#include <htslib/kstring.h>
+#include <htslib/cram.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
+#include "samtools.h"
KHASH_MAP_INIT_STR(c2i, int)
samFile** rg_output_file;
bam_hdr_t** rg_output_header;
kh_c2i_t* rg_hash;
+ htsThreadPool p;
};
typedef struct state state_t;
" -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n"
" -u FILE1:FILE2 ...and override the header with FILE2\n"
" -v verbose output\n");
- sam_global_opt_help(write_to, "-....");
+ sam_global_opt_help(write_to, "-....@");
fprintf(write_to,
"\n"
"Format string expansions:\n"
{
if (argc == 1) { usage(stdout); return NULL; }
- const char* optstring = "vf:u:";
+ const char* optstring = "vf:u:@:";
char* delim;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
argv += optind;
if (argc != 1) {
- fprintf(stderr, "Invalid number of arguments: %d\n", argc);
+ print_error("split", "Invalid number of arguments: %d", argc);
usage(stderr);
free(retval);
return NULL;
// Filters a header of @RG lines where ID != id_keep
// TODO: strip @PG's descended from other RGs and their descendants
-static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep)
+static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list)
{
kstring_t str = {0, 0, NULL};
free(hdr->text);
hdr->text = ks_release(&str);
+ // Add the PG line
+ SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text);
+ if (sam_hdr_add_PG(sh, "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ return -1;
+
+ free(hdr->text);
+ hdr->text = strdup(sam_hdr_str(sh));
+ hdr->l_text = sam_hdr_length(sh);
+ if (!hdr->text)
+ return false;
+ sam_hdr_free(sh);
+
return true;
}
// Set the initial state
-static state_t* init(parsed_opts_t* opts)
+static state_t* init(parsed_opts_t* opts, const char *arg_list)
{
state_t* retval = calloc(sizeof(state_t), 1);
if (!retval) {
- fprintf(stderr, "Out of memory");
+ print_error_errno("split", "Initialisation failed");
return NULL;
}
+ if (opts->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ return NULL;
+ }
+ }
+
retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
if (!retval->merged_input_file) {
- fprintf(stderr, "Could not open input file (%s)\n", opts->merged_input_name);
+ print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
free(retval);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p);
retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
if (retval->merged_input_header == NULL) {
- fprintf(stderr, "Could not read header for file '%s'\n",
- opts->merged_input_name);
+ print_error("split", "Could not read header from \"%s\"", opts->merged_input_name);
cleanup_state(retval, false);
return NULL;
}
if (opts->unaccounted_header_name) {
samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
- fprintf(stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
+ print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
if (retval->unaccounted_header == NULL) {
- fprintf(stderr, "Could not read header for file '%s'\n",
- opts->unaccounted_header_name);
+ print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
- fprintf(stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
+ print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
cleanup_state(retval, false);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
}
// Open output files for RGs
retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*));
retval->rg_hash = kh_init_c2i();
if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) {
- fprintf(stderr, "Could not allocate memory for output file array. Out of memory?");
+ print_error_errno("split", "Could not initialise output file array");
cleanup_state(retval, false);
return NULL;
}
char* dirsep = strrchr(opts->merged_input_name, '/');
char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
if (!input_base_name) {
- fprintf(stderr, "Out of memory\n");
+ print_error_errno("split", "Filename manipulation failed");
cleanup_state(retval, false);
return NULL;
}
&opts->ga.out);
if ( output_filename == NULL ) {
- fprintf(stderr, "Error expanding output filename format string.\n");
+ print_error("split", "Error expanding output filename format string");
cleanup_state(retval, false);
free(input_base_name);
return NULL;
retval->rg_output_file_name[i] = output_filename;
retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
- fprintf(stderr, "Could not open output file: %s\n", output_filename);
+ print_error_errno("split", "Could not open \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
// Record index in hash
int ret;
// Set and edit header
retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header);
- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) {
- fprintf(stderr, "Could not rewrite header for file: %s\n", output_filename);
+ if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) {
+ print_error("split", "Could not rewrite header for \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
static bool split(state_t* state)
{
if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
- fprintf(stderr, "Could not write output file header\n");
+ print_error_errno("split", "Could not write output file header");
return false;
}
size_t i;
for (i = 0; i < state->output_count; i++) {
if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
- fprintf(stderr, "Could not write output file header for '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
return false;
}
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(stderr, "Could not read first input record\n");
+ print_error("split", "Could not read first input record");
return false;
}
}
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
- fprintf(stderr, "Could not write to output file '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
bam_destroy1(file_read);
return false;
}
return false;
} else {
if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
- fprintf(stderr, "Could not write to unaccounted output file\n");
+ print_error_errno("split", "Could not write to unaccounted output file");
bam_destroy1(file_read);
return false;
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(stderr, "Could not read input record\n");
+ print_error("split", "Could not read input record");
return false;
}
}
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
if (status->unaccounted_file) {
if (sam_close(status->unaccounted_file) < 0 && check_close) {
- fprintf(stderr, "Error on closing unaccounted file\n");
+ print_error("split", "Error on closing unaccounted file");
ret = -1;
}
}
bam_hdr_destroy(status->rg_output_header[i]);
if (status->rg_output_file && status->rg_output_file[i]) {
if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
- fprintf(stderr, "Error on closing output file '%s'\n",
- status->rg_output_file_name[i]);
+ print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
ret = -1;
}
}
free(status->rg_id);
free(status);
+ if (status->p.pool)
+ hts_tpool_destroy(status->p.pool);
+
return ret;
}
int main_split(int argc, char** argv)
{
int ret = 1;
+ char *arg_list = stringify_argv(argc+1, argv-1);
parsed_opts_t* opts = parse_args(argc, argv);
if (!opts) goto cleanup_opts;
- state_t* status = init(opts);
+ state_t* status = init(opts, arg_list);
if (!status) goto cleanup_opts;
if (!split(status)) {
cleanup_opts:
cleanup_opts(opts);
+ free(arg_list);
return ret;
}
/* bam_split.c -- split subcommand.
- Copyright (C) 2013-2015 Genome Research Ltd.
+ Copyright (C) 2013-2016 Genome Research Ltd.
Author: Martin Pollard <mp15@sanger.ac.uk>
#include <regex.h>
#include <htslib/khash.h>
#include <htslib/kstring.h>
+#include <htslib/cram.h>
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
+#include "samtools.h"
KHASH_MAP_INIT_STR(c2i, int)
samFile** rg_output_file;
bam_hdr_t** rg_output_header;
kh_c2i_t* rg_hash;
+ htsThreadPool p;
};
typedef struct state state_t;
" -u FILE1 put reads with no RG tag or an unrecognised RG tag in FILE1\n"
" -u FILE1:FILE2 ...and override the header with FILE2\n"
" -v verbose output\n");
- sam_global_opt_help(write_to, "-....");
+ sam_global_opt_help(write_to, "-....@");
fprintf(write_to,
"\n"
"Format string expansions:\n"
{
if (argc == 1) { usage(pysam_stdout); return NULL; }
- const char* optstring = "vf:u:";
+ const char* optstring = "vf:u:@:";
char* delim;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
argv += optind;
if (argc != 1) {
- fprintf(pysam_stderr, "Invalid number of arguments: %d\n", argc);
+ print_error("split", "Invalid number of arguments: %d", argc);
usage(pysam_stderr);
free(retval);
return NULL;
// Filters a header of @RG lines where ID != id_keep
// TODO: strip @PG's descended from other RGs and their descendants
-static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep)
+static bool filter_header_rg(bam_hdr_t* hdr, const char* id_keep, const char *arg_list)
{
kstring_t str = {0, 0, NULL};
free(hdr->text);
hdr->text = ks_release(&str);
+ // Add the PG line
+ SAM_hdr *sh = sam_hdr_parse_(hdr->text, hdr->l_text);
+ if (sam_hdr_add_PG(sh, "samtools",
+ "VN", samtools_version(),
+ arg_list ? "CL": NULL,
+ arg_list ? arg_list : NULL,
+ NULL) != 0)
+ return -1;
+
+ free(hdr->text);
+ hdr->text = strdup(sam_hdr_str(sh));
+ hdr->l_text = sam_hdr_length(sh);
+ if (!hdr->text)
+ return false;
+ sam_hdr_free(sh);
+
return true;
}
// Set the initial state
-static state_t* init(parsed_opts_t* opts)
+static state_t* init(parsed_opts_t* opts, const char *arg_list)
{
state_t* retval = calloc(sizeof(state_t), 1);
if (!retval) {
- fprintf(pysam_stderr, "Out of memory");
+ print_error_errno("split", "Initialisation failed");
return NULL;
}
+ if (opts->ga.nthreads > 0) {
+ if (!(retval->p.pool = hts_tpool_init(opts->ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ return NULL;
+ }
+ }
+
retval->merged_input_file = sam_open_format(opts->merged_input_name, "rb", &opts->ga.in);
if (!retval->merged_input_file) {
- fprintf(pysam_stderr, "Could not open input file (%s)\n", opts->merged_input_name);
+ print_error_errno("split", "Could not open \"%s\"", opts->merged_input_name);
free(retval);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->merged_input_file, HTS_OPT_THREAD_POOL, &retval->p);
retval->merged_input_header = sam_hdr_read(retval->merged_input_file);
if (retval->merged_input_header == NULL) {
- fprintf(pysam_stderr, "Could not read header for file '%s'\n",
- opts->merged_input_name);
+ print_error("split", "Could not read header from \"%s\"", opts->merged_input_name);
cleanup_state(retval, false);
return NULL;
}
if (opts->unaccounted_header_name) {
samFile* hdr_load = sam_open_format(opts->unaccounted_header_name, "r", &opts->ga.in);
if (!hdr_load) {
- fprintf(pysam_stderr, "Could not open unaccounted header file (%s)\n", opts->unaccounted_header_name);
+ print_error_errno("split", "Could not open unaccounted header file \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_header = sam_hdr_read(hdr_load);
if (retval->unaccounted_header == NULL) {
- fprintf(pysam_stderr, "Could not read header for file '%s'\n",
- opts->unaccounted_header_name);
+ print_error("split", "Could not read header from \"%s\"", opts->unaccounted_header_name);
cleanup_state(retval, false);
return NULL;
}
retval->unaccounted_file = sam_open_format(opts->unaccounted_name, "wb", &opts->ga.out);
if (retval->unaccounted_file == NULL) {
- fprintf(pysam_stderr, "Could not open unaccounted output file: %s\n", opts->unaccounted_name);
+ print_error_errno("split", "Could not open unaccounted output file \"%s\"", opts->unaccounted_name);
cleanup_state(retval, false);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->unaccounted_file, HTS_OPT_THREAD_POOL, &retval->p);
}
// Open output files for RGs
retval->rg_output_header = (bam_hdr_t**)calloc(retval->output_count, sizeof(bam_hdr_t*));
retval->rg_hash = kh_init_c2i();
if (!retval->rg_output_file_name || !retval->rg_output_file || !retval->rg_output_header || !retval->rg_hash) {
- fprintf(pysam_stderr, "Could not allocate memory for output file array. Out of memory?");
+ print_error_errno("split", "Could not initialise output file array");
cleanup_state(retval, false);
return NULL;
}
char* dirsep = strrchr(opts->merged_input_name, '/');
char* input_base_name = strdup(dirsep? dirsep+1 : opts->merged_input_name);
if (!input_base_name) {
- fprintf(pysam_stderr, "Out of memory\n");
+ print_error_errno("split", "Filename manipulation failed");
cleanup_state(retval, false);
return NULL;
}
&opts->ga.out);
if ( output_filename == NULL ) {
- fprintf(pysam_stderr, "Error expanding output filename format string.\n");
+ print_error("split", "Error expanding output filename format string");
cleanup_state(retval, false);
free(input_base_name);
return NULL;
retval->rg_output_file_name[i] = output_filename;
retval->rg_output_file[i] = sam_open_format(output_filename, "wb", &opts->ga.out);
if (retval->rg_output_file[i] == NULL) {
- fprintf(pysam_stderr, "Could not open output file: %s\n", output_filename);
+ print_error_errno("split", "Could not open \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
}
+ if (retval->p.pool)
+ hts_set_opt(retval->rg_output_file[i], HTS_OPT_THREAD_POOL, &retval->p);
// Record index in hash
int ret;
// Set and edit header
retval->rg_output_header[i] = bam_hdr_dup(retval->merged_input_header);
- if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i]) ) {
- fprintf(pysam_stderr, "Could not rewrite header for file: %s\n", output_filename);
+ if ( !filter_header_rg(retval->rg_output_header[i], retval->rg_id[i], arg_list) ) {
+ print_error("split", "Could not rewrite header for \"%s\"", output_filename);
cleanup_state(retval, false);
free(input_base_name);
return NULL;
static bool split(state_t* state)
{
if (state->unaccounted_file && sam_hdr_write(state->unaccounted_file, state->unaccounted_header) != 0) {
- fprintf(pysam_stderr, "Could not write output file header\n");
+ print_error_errno("split", "Could not write output file header");
return false;
}
size_t i;
for (i = 0; i < state->output_count; i++) {
if (sam_hdr_write(state->rg_output_file[i], state->rg_output_header[i]) != 0) {
- fprintf(pysam_stderr, "Could not write output file header for '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write file header to \"%s\"", state->rg_output_file_name[i]);
return false;
}
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(pysam_stderr, "Could not read first input record\n");
+ print_error("split", "Could not read first input record");
return false;
}
}
// if found write to the appropriate untangled bam
int i = kh_val(state->rg_hash,iter);
if (sam_write1(state->rg_output_file[i], state->rg_output_header[i], file_read) < 0) {
- fprintf(pysam_stderr, "Could not write to output file '%s'\n",
- state->rg_output_file_name[i]);
+ print_error_errno("split", "Could not write to \"%s\"", state->rg_output_file_name[i]);
bam_destroy1(file_read);
return false;
}
return false;
} else {
if (sam_write1(state->unaccounted_file, state->unaccounted_header, file_read) < 0) {
- fprintf(pysam_stderr, "Could not write to unaccounted output file\n");
+ print_error_errno("split", "Could not write to unaccounted output file");
bam_destroy1(file_read);
return false;
}
bam_destroy1(file_read);
file_read = NULL;
if (r < -1) {
- fprintf(pysam_stderr, "Could not read input record\n");
+ print_error("split", "Could not read input record");
return false;
}
}
if (status->unaccounted_header) bam_hdr_destroy(status->unaccounted_header);
if (status->unaccounted_file) {
if (sam_close(status->unaccounted_file) < 0 && check_close) {
- fprintf(pysam_stderr, "Error on closing unaccounted file\n");
+ print_error("split", "Error on closing unaccounted file");
ret = -1;
}
}
bam_hdr_destroy(status->rg_output_header[i]);
if (status->rg_output_file && status->rg_output_file[i]) {
if (sam_close(status->rg_output_file[i]) < 0 && check_close) {
- fprintf(pysam_stderr, "Error on closing output file '%s'\n",
- status->rg_output_file_name[i]);
+ print_error("split", "Error on closing output file \"%s\"", status->rg_output_file_name[i]);
ret = -1;
}
}
free(status->rg_id);
free(status);
+ if (status->p.pool)
+ hts_tpool_destroy(status->p.pool);
+
return ret;
}
int main_split(int argc, char** argv)
{
int ret = 1;
+ char *arg_list = stringify_argv(argc+1, argv-1);
parsed_opts_t* opts = parse_args(argc, argv);
if (!opts) goto cleanup_opts;
- state_t* status = init(opts);
+ state_t* status = init(opts, arg_list);
if (!status) goto cleanup_opts;
if (!split(status)) {
cleanup_opts:
cleanup_opts(opts);
+ free(arg_list);
return ret;
}
#include "htslib/sam.h"
#include "samtools.h"
+#include "sam_opts.h"
typedef struct {
long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2];
static void usage_exit(FILE *fp, int exit_status)
{
- fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] <in.bam>\n");
+ fprintf(fp, "Usage: samtools flagstat [options] <in.bam>\n");
+ sam_global_opt_help(fp, "-.---@");
exit(exit_status);
}
bam_hdr_t *header;
bam_flagstat_t *s;
char b0[16], b1[16];
- hts_opt *in_opts = NULL;
int c;
enum {
INPUT_FMT_OPTION = CHAR_MAX+1,
};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION},
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) {
switch (c) {
- case INPUT_FMT_OPTION:
- if (hts_opt_add(&in_opts, optarg) < 0)
- usage_exit(stderr, EXIT_FAILURE);
- break;
- default:
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?':
usage_exit(stderr, EXIT_FAILURE);
}
}
if (argc == optind) usage_exit(stdout, EXIT_SUCCESS);
else usage_exit(stderr, EXIT_FAILURE);
}
- fp = sam_open(argv[optind], "r");
+ fp = sam_open_format(argv[optind], "r", &ga.in);
if (fp == NULL) {
print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]);
return 1;
}
- if (hts_opt_apply(fp, in_opts)) {
- fprintf(stderr, "Failed to apply input-fmt-options\n");
- return 1;
- }
+ if (ga.nthreads > 0)
+ hts_set_threads(fp, ga.nthreads);
if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) {
free(s);
bam_hdr_destroy(header);
sam_close(fp);
- hts_opt_free(in_opts);
+ sam_global_args_free(&ga);
return 0;
}
#include "htslib/sam.h"
#include "samtools.h"
+#include "sam_opts.h"
typedef struct {
long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2];
return buffer;
}
-static void usage_exit(FILE *fp, int exit_status)
+static int usage_exit(FILE *fp, int exit_status)
{
- fprintf(fp, "Usage: samtools flagstat [--input-fmt-option OPT=VAL] <in.bam>\n");
- exit(exit_status);
+ fprintf(fp, "Usage: samtools flagstat [options] <in.bam>\n");
+ sam_global_opt_help(fp, "-.---@");
+ return(exit_status);
}
int bam_flagstat(int argc, char *argv[])
bam_hdr_t *header;
bam_flagstat_t *s;
char b0[16], b1[16];
- hts_opt *in_opts = NULL;
int c;
enum {
INPUT_FMT_OPTION = CHAR_MAX+1,
};
+ sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- {"input-fmt-option", required_argument, NULL, INPUT_FMT_OPTION},
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', '-', '@'),
{NULL, 0, NULL, 0}
};
- while ((c = getopt_long(argc, argv, "", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "@:", lopts, NULL)) >= 0) {
switch (c) {
- case INPUT_FMT_OPTION:
- if (hts_opt_add(&in_opts, optarg) < 0)
- usage_exit(pysam_stderr, EXIT_FAILURE);
- break;
- default:
- usage_exit(pysam_stderr, EXIT_FAILURE);
+ default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
+ /* else fall-through */
+ case '?':
+ return(usage_exit(pysam_stderr, EXIT_FAILURE));
}
}
if (argc != optind+1) {
- if (argc == optind) usage_exit(pysam_stdout, EXIT_SUCCESS);
- else usage_exit(pysam_stderr, EXIT_FAILURE);
+ if (argc == optind) return(usage_exit(pysam_stdout, EXIT_SUCCESS));
+ else return(usage_exit(pysam_stderr, EXIT_FAILURE));
}
- fp = sam_open(argv[optind], "r");
+ fp = sam_open_format(argv[optind], "r", &ga.in);
if (fp == NULL) {
print_error_errno("flagstat", "Cannot open input file \"%s\"", argv[optind]);
return 1;
}
- if (hts_opt_apply(fp, in_opts)) {
- fprintf(pysam_stderr, "Failed to apply input-fmt-options\n");
- return 1;
- }
+ if (ga.nthreads > 0)
+ hts_set_threads(fp, ga.nthreads);
if (hts_set_opt(fp, CRAM_OPT_REQUIRED_FIELDS,
SAM_FLAG | SAM_MAPQ | SAM_RNEXT)) {
free(s);
bam_hdr_destroy(header);
sam_close(fp);
- hts_opt_free(in_opts);
+ sam_global_args_free(&ga);
return 0;
}
+++ /dev/null
-/* bam_tview.c -- tview subcommand.
-
- Copyright (C) 2008-2015 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <regex.h>
-#include <assert.h>
-#include "bam_tview.h"
-#include <htslib/faidx.h>
-#include <htslib/sam.h>
-#include <htslib/bgzf.h>
-#include "sam_opts.h"
-
-khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
-{
- khash_t(kh_rg)* rg_hash = kh_init(kh_rg);
- // given sample id return all the RD ID's
- const char rg_regex[] = "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)";
-
- regex_t rg_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- regcomp(&rg_id, rg_regex, REG_EXTENDED|REG_NEWLINE);
- char* text = strdup(header);
- char* end = text + strlen(header);
- char* tofree = text;
- while (end > text && regexec(&rg_id, text, 2, matches, 0) == 0) { // foreach rg id in header
- int ret;
- text[matches[1].rm_eo] = '\0';
- kh_put(kh_rg, rg_hash, strdup(text+matches[1].rm_so), &ret); // Add the RG to the list
- text += matches[0].rm_eo + 1; // Move search pointer forward
- }
- free(tofree);
- return rg_hash;
-}
-
-int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt)
-{
- assert(tv!=NULL);
- assert(fn!=NULL);
- tv->mrow = 24; tv->mcol = 80;
- tv->color_for = TV_COLOR_MAPQ;
- tv->is_dot = 1;
-
- tv->fp = sam_open_format(fn, "r", fmt);
- if(tv->fp == NULL)
- {
- fprintf(stderr,"sam_open %s. %s\n", fn,fn_fa);
- exit(EXIT_FAILURE);
- }
- // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024);
- assert(tv->fp);
-
- tv->header = sam_hdr_read(tv->fp);
- if(tv->header == NULL)
- {
- fprintf(stderr,"Cannot read '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->idx = sam_index_load(tv->fp, fn);
- if (tv->idx == NULL)
- {
- fprintf(stderr,"Cannot read index for '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
- if (fn_fa) tv->fai = fai_load(fn_fa);
- tv->bca = bcf_call_init(0.83, 13);
- tv->ins = 1;
-
- // If the user has asked for specific samples find out create a list of readgroups make up these samples
- if ( samples )
- {
- tv->rg_hash = get_rg_sample(tv->header->text, samples); // Init the list of rg's
- }
-
- return 0;
-}
-
-
-void base_tv_destroy(tview_t* tv)
-{
- bam_lplbuf_destroy(tv->lplbuf);
- bcf_call_destroy(tv->bca);
- hts_idx_destroy(tv->idx);
- if (tv->fai) fai_destroy(tv->fai);
- free(tv->ref);
- bam_hdr_destroy(tv->header);
- sam_close(tv->fp);
-}
-
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
-{
- tview_t *tv = (tview_t*)data;
- int i, j, c, rb, attr, max_ins = 0;
- uint32_t call = 0;
- if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
- // print reference
- rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
- for (i = tv->last_pos + 1; i < pos; ++i) {
- if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1);
- c = tv->ref? tv->ref[i - tv->left_pos] : 'N';
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- { // call consensus
- bcf_callret1_t bcr;
- memset(&bcr, 0, sizeof bcr);
- int qsum[4], a1, a2, tmp;
- double p[3], prior = 30;
- bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr);
- for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i;
- for (i = 1; i < 4; ++i) // insertion sort
- for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
- tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
- a1 = qsum[0]&3; a2 = qsum[1]&3;
- p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
- if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
- if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
- if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499);
- else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
- else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
- }
- attr = tv->my_underline(tv);
- c = ",ACMGRSVTWYHKDBN"[call>>16&0xf];
- i = (call&0xffff)/10+1;
- if (i > 4) i = 4;
- attr |= tv->my_colorpair(tv,i);
- if (c == toupper(rb)) c = '.';
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,2, tv->ccol, c);
- tv->my_attroff(tv,attr);
- if(tv->ins) {
- // calculate maximum insert
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
- }
- }
- // core loop
- for (j = 0; j <= max_ins; ++j) {
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- int row = TV_MIN_ALNROW + p->level - tv->row_shift;
- if (j == 0) {
- if (!p->is_del) {
- if (tv->base_for == TV_BASE_COLOR_SPACE &&
- (c = bam_aux_getCSi(p->b, p->qpos))) {
- // assume that if we found one color, we will be able to get the color error
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.';
- } else {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
- if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*';
- } else { // padding
- if (j > p->indel) c = '*';
- else { // insertion
- if (tv->base_for == TV_BASE_NUCL) {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)];
- if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- } else {
- c = bam_aux_getCSi(p->b, p->qpos + j);
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- }
- if (row > TV_MIN_ALNROW && row < tv->mrow) {
- int x;
- attr = 0;
- if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
- || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
- if (tv->color_for == TV_COLOR_BASEQ) {
- x = bam_get_qual(p->b)[p->qpos]/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_MAPQ) {
- x = p->b->core.qual/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_NUCL) {
- x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COL) {
- x = 0;
- switch(bam_aux_getCSi(p->b, p->qpos)) {
- case '0': x = 0; break;
- case '1': x = 1; break;
- case '2': x = 2; break;
- case '3': x = 3; break;
- case '4': x = 4; break;
- default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
- }
- x+=5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COLQ) {
- x = bam_aux_getCQi(p->b, p->qpos);
- if(0 == x) x = bam_get_qual(p->b)[p->qpos];
- x = x/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- }
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c));
- tv->my_attroff(tv,attr);
- }
- }
- c = j? '*' : rb;
- if (c == '*') {
- attr = tv->my_colorpair(tv,8);
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- tv->my_attroff(tv,attr);
- } else tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- tv->last_pos = pos;
- return 0;
-}
-
-
-
-
-static int tv_push_aln(const bam1_t *b, tview_t *tv)
-{
- /* If we are restricted to specific readgroups check RG is in the list */
- if ( tv->rg_hash )
- {
- const uint8_t *rg = bam_aux_get(b, "RG");
- if ( !rg ) return 0; // If we don't have an RG tag exclude read
- khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1));
- if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read
- }
- if (tv->no_skip) {
- uint32_t *cigar = bam_get_cigar(b); // this is cheating...
- int i;
- for (i = 0; i <b->core.n_cigar; ++i) {
- if ((cigar[i]&0xf) == BAM_CREF_SKIP)
- cigar[i] = cigar[i]>>4<<4 | BAM_CDEL;
- }
- }
- bam_lplbuf_push(b, tv->lplbuf);
- return 0;
-}
-
-int base_draw_aln(tview_t *tv, int tid, int pos)
-{
- assert(tv!=NULL);
- // reset
- tv->my_clear(tv);
- tv->curr_tid = tid; tv->left_pos = pos;
- tv->last_pos = tv->left_pos - 1;
- tv->ccol = 0;
- // print ref and consensus
- if (tv->fai) {
- char *str;
- if (tv->ref) free(tv->ref);
- assert(tv->curr_tid>=0);
-
- str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1);
- assert(str!=NULL);
- sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol);
- tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
- free(str);
- if ( !tv->ref )
- {
- fprintf(stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
- exit(1);
- }
- }
- // draw aln
- bam_lplbuf_reset(tv->lplbuf);
- hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
- bam1_t *b = bam_init1();
- while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv);
- bam_destroy1(b);
- hts_itr_destroy(iter);
- bam_lplbuf_push(0, tv->lplbuf);
-
- while (tv->ccol < tv->mcol) {
- int pos = tv->last_pos + 1;
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N');
- ++tv->last_pos;
- }
- return 0;
-}
-
-
-
-
-static void error(const char *format, ...)
-{
- if ( !format )
- {
- fprintf(stderr,
-"Usage: samtools tview [options] <aln.bam> [ref.fasta]\n"
-"Options:\n"
-" -d display output as (H)tml or (C)urses or (T)ext \n"
-" -p chr:pos go directly to this position\n"
-" -s STR display only reads from this sample or group\n");
- sam_global_opt_help(stderr, "-.--.");
- }
- else
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- va_end(ap);
- }
- exit(-1);
-}
-
-enum dipsay_mode {display_ncurses,display_html,display_text};
-extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* html_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-
-int bam_tview_main(int argc, char *argv[])
-{
- int view_mode=display_ncurses;
- tview_t* tv=NULL;
- char *samples=NULL, *position=NULL, *ref;
- int c;
-
- sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
- { NULL, 0, NULL, 0 }
- };
-
- while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) {
- switch (c) {
- case 's': samples=optarg; break;
- case 'p': position=optarg; break;
- case 'd':
- {
- switch(optarg[0])
- {
- case 'H': case 'h': view_mode=display_html;break;
- case 'T': case 't': view_mode=display_text;break;
- case 'C': case 'c': view_mode=display_ncurses;break;
- default: view_mode=display_ncurses;break;
- }
- break;
- }
- default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- /* else fall-through */
- case '?': error(NULL);
- }
- }
- if (argc==optind) error(NULL);
-
- ref = (optind+1>=argc)? ga.reference : argv[optind+1];
-
- switch(view_mode)
- {
- case display_ncurses:
- tv = curses_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_text:
- tv = text_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_html:
- tv = html_tv_init(argv[optind], ref, samples, &ga.in);
- break;
- }
- if (tv==NULL)
- {
- error("cannot create view");
- return EXIT_FAILURE;
- }
-
- if ( position )
- {
- int tid, beg, end;
- char *name_lim = (char *) hts_parse_reg(position, &beg, &end);
- if (name_lim) *name_lim = '\0';
- else beg = 0; // region parsing failed, but possibly a seq named "foo:a"
- tid = bam_name2id(tv->header, position);
- if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; }
- }
- else if ( tv->fai )
- {
- // find the first sequence present in both BAM and the reference file
- int i;
- for (i=0; i<tv->header->n_targets; i++)
- {
- if ( faidx_has_seq(tv->fai, tv->header->target_name[i]) ) break;
- }
- if ( i==tv->header->n_targets )
- {
- fprintf(stderr,"None of the BAM sequence names present in the fasta file\n");
- exit(EXIT_FAILURE);
- }
- tv->curr_tid = i;
- }
- tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- tv->my_loop(tv);
- tv->my_destroy(tv);
-
- return EXIT_SUCCESS;
-}
+++ /dev/null
-#include "pysam.h"
-
-/* bam_tview.c -- tview subcommand.
-
- Copyright (C) 2008-2015 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <regex.h>
-#include <assert.h>
-#include "bam_tview.h"
-#include <htslib/faidx.h>
-#include <htslib/sam.h>
-#include <htslib/bgzf.h>
-#include "sam_opts.h"
-
-khash_t(kh_rg)* get_rg_sample(const char* header, const char* sample)
-{
- khash_t(kh_rg)* rg_hash = kh_init(kh_rg);
- // given sample id return all the RD ID's
- const char rg_regex[] = "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)";
-
- regex_t rg_id;
- regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
- if (matches == NULL) { perror("out of memory"); exit(-1); }
- regcomp(&rg_id, rg_regex, REG_EXTENDED|REG_NEWLINE);
- char* text = strdup(header);
- char* end = text + strlen(header);
- char* tofree = text;
- while (end > text && regexec(&rg_id, text, 2, matches, 0) == 0) { // foreach rg id in header
- int ret;
- text[matches[1].rm_eo] = '\0';
- kh_put(kh_rg, rg_hash, strdup(text+matches[1].rm_so), &ret); // Add the RG to the list
- text += matches[0].rm_eo + 1; // Move search pointer forward
- }
- free(tofree);
- return rg_hash;
-}
-
-int base_tv_init(tview_t* tv, const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt)
-{
- assert(tv!=NULL);
- assert(fn!=NULL);
- tv->mrow = 24; tv->mcol = 80;
- tv->color_for = TV_COLOR_MAPQ;
- tv->is_dot = 1;
-
- tv->fp = sam_open_format(fn, "r", fmt);
- if(tv->fp == NULL)
- {
- fprintf(pysam_stderr,"sam_open %s. %s\n", fn,fn_fa);
- exit(EXIT_FAILURE);
- }
- // TODO bgzf_set_cache_size(tv->fp->fp.bgzf, 8 * 1024 *1024);
- assert(tv->fp);
-
- tv->header = sam_hdr_read(tv->fp);
- if(tv->header == NULL)
- {
- fprintf(pysam_stderr,"Cannot read '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->idx = sam_index_load(tv->fp, fn);
- if (tv->idx == NULL)
- {
- fprintf(pysam_stderr,"Cannot read index for '%s'.\n", fn);
- exit(EXIT_FAILURE);
- }
- tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
- if (fn_fa) tv->fai = fai_load(fn_fa);
- tv->bca = bcf_call_init(0.83, 13);
- tv->ins = 1;
-
- // If the user has asked for specific samples find out create a list of readgroups make up these samples
- if ( samples )
- {
- tv->rg_hash = get_rg_sample(tv->header->text, samples); // Init the list of rg's
- }
-
- return 0;
-}
-
-
-void base_tv_destroy(tview_t* tv)
-{
- bam_lplbuf_destroy(tv->lplbuf);
- bcf_call_destroy(tv->bca);
- hts_idx_destroy(tv->idx);
- if (tv->fai) fai_destroy(tv->fai);
- free(tv->ref);
- bam_hdr_destroy(tv->header);
- sam_close(tv->fp);
-}
-
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
-{
- tview_t *tv = (tview_t*)data;
- int i, j, c, rb, attr, max_ins = 0;
- uint32_t call = 0;
- if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
- // print reference
- rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
- for (i = tv->last_pos + 1; i < pos; ++i) {
- if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1);
- c = tv->ref? tv->ref[i - tv->left_pos] : 'N';
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- { // call consensus
- bcf_callret1_t bcr;
- memset(&bcr, 0, sizeof bcr);
- int qsum[4], a1, a2, tmp;
- double p[3], prior = 30;
- bcf_call_glfgen(n, pl, seq_nt16_table[rb], tv->bca, &bcr);
- for (i = 0; i < 4; ++i) qsum[i] = ((int)bcr.qsum[i])<<2 | i;
- for (i = 1; i < 4; ++i) // insertion sort
- for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j)
- tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
- a1 = qsum[0]&3; a2 = qsum[1]&3;
- p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2];
- if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3;
- if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3;
- if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499);
- else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499);
- else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499);
- }
- attr = tv->my_underline(tv);
- c = ",ACMGRSVTWYHKDBN"[call>>16&0xf];
- i = (call&0xffff)/10+1;
- if (i > 4) i = 4;
- attr |= tv->my_colorpair(tv,i);
- if (c == toupper(rb)) c = '.';
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,2, tv->ccol, c);
- tv->my_attroff(tv,attr);
- if(tv->ins) {
- // calculate maximum insert
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
- }
- }
- // core loop
- for (j = 0; j <= max_ins; ++j) {
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pl + i;
- int row = TV_MIN_ALNROW + p->level - tv->row_shift;
- if (j == 0) {
- if (!p->is_del) {
- if (tv->base_for == TV_BASE_COLOR_SPACE &&
- (c = bam_aux_getCSi(p->b, p->qpos))) {
- // assume that if we found one color, we will be able to get the color error
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam_is_rev(p->b)? ',' : '.';
- } else {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
- if (tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- } else c = p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*';
- } else { // padding
- if (j > p->indel) c = '*';
- else { // insertion
- if (tv->base_for == TV_BASE_NUCL) {
- if (tv->show_name) {
- char *name = bam_get_qname(p->b);
- c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j];
- } else {
- c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)];
- if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam_is_rev(p->b)? ',' : '.';
- }
- } else {
- c = bam_aux_getCSi(p->b, p->qpos + j);
- if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam_is_rev(p->b)? ',' : '.';
- }
- }
- }
- if (row > TV_MIN_ALNROW && row < tv->mrow) {
- int x;
- attr = 0;
- if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
- || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv);
- if (tv->color_for == TV_COLOR_BASEQ) {
- x = bam_get_qual(p->b)[p->qpos]/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_MAPQ) {
- x = p->b->core.qual/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- } else if (tv->color_for == TV_COLOR_NUCL) {
- x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)] + 5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COL) {
- x = 0;
- switch(bam_aux_getCSi(p->b, p->qpos)) {
- case '0': x = 0; break;
- case '1': x = 1; break;
- case '2': x = 2; break;
- case '3': x = 3; break;
- case '4': x = 4; break;
- default: x = seq_nt16_int[bam_seqi(bam_get_seq(p->b), p->qpos)]; break;
- }
- x+=5;
- attr |= tv->my_colorpair(tv,x);
- } else if(tv->color_for == TV_COLOR_COLQ) {
- x = bam_aux_getCQi(p->b, p->qpos);
- if(0 == x) x = bam_get_qual(p->b)[p->qpos];
- x = x/10 + 1;
- if (x > 4) x = 4;
- attr |= tv->my_colorpair(tv,x);
- }
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,row, tv->ccol, bam_is_rev(p->b)? tolower(c) : toupper(c));
- tv->my_attroff(tv,attr);
- }
- }
- c = j? '*' : rb;
- if (c == '*') {
- attr = tv->my_colorpair(tv,8);
- tv->my_attron(tv,attr);
- tv->my_mvaddch(tv,1, tv->ccol++, c);
- tv->my_attroff(tv,attr);
- } else tv->my_mvaddch(tv,1, tv->ccol++, c);
- }
- tv->last_pos = pos;
- return 0;
-}
-
-
-
-
-static int tv_push_aln(const bam1_t *b, tview_t *tv)
-{
- /* If we are restricted to specific readgroups check RG is in the list */
- if ( tv->rg_hash )
- {
- const uint8_t *rg = bam_aux_get(b, "RG");
- if ( !rg ) return 0; // If we don't have an RG tag exclude read
- khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1));
- if ( k == kh_end(tv->rg_hash) ) return 0; // if RG tag is not in list of allowed tags exclude read
- }
- if (tv->no_skip) {
- uint32_t *cigar = bam_get_cigar(b); // this is cheating...
- int i;
- for (i = 0; i <b->core.n_cigar; ++i) {
- if ((cigar[i]&0xf) == BAM_CREF_SKIP)
- cigar[i] = cigar[i]>>4<<4 | BAM_CDEL;
- }
- }
- bam_lplbuf_push(b, tv->lplbuf);
- return 0;
-}
-
-int base_draw_aln(tview_t *tv, int tid, int pos)
-{
- assert(tv!=NULL);
- // reset
- tv->my_clear(tv);
- tv->curr_tid = tid; tv->left_pos = pos;
- tv->last_pos = tv->left_pos - 1;
- tv->ccol = 0;
- // print ref and consensus
- if (tv->fai) {
- char *str;
- if (tv->ref) free(tv->ref);
- assert(tv->curr_tid>=0);
-
- str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1);
- assert(str!=NULL);
- sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol);
- tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
- free(str);
- if ( !tv->ref )
- {
- fprintf(pysam_stderr,"Could not read the reference sequence. Is it seekable (plain text or compressed + .gzi indexed with bgzip)?\n");
- exit(1);
- }
- }
- // draw aln
- bam_lplbuf_reset(tv->lplbuf);
- hts_itr_t *iter = sam_itr_queryi(tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol);
- bam1_t *b = bam_init1();
- while (sam_itr_next(tv->fp, iter, b) >= 0) tv_push_aln(b, tv);
- bam_destroy1(b);
- hts_itr_destroy(iter);
- bam_lplbuf_push(0, tv->lplbuf);
-
- while (tv->ccol < tv->mcol) {
- int pos = tv->last_pos + 1;
- if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1);
- tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N');
- ++tv->last_pos;
- }
- return 0;
-}
-
-
-
-
-static void error(const char *format, ...)
-{
- if ( !format )
- {
- fprintf(pysam_stderr,
-"Usage: samtools tview [options] <aln.bam> [ref.fasta]\n"
-"Options:\n"
-" -d display output as (H)tml or (C)urses or (T)ext \n"
-" -p chr:pos go directly to this position\n"
-" -s STR display only reads from this sample or group\n");
- sam_global_opt_help(pysam_stderr, "-.--.");
- }
- else
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(pysam_stderr, format, ap);
- va_end(ap);
- }
- exit(-1);
-}
-
-enum dipsay_mode {display_ncurses,display_html,display_text};
-extern tview_t* curses_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* html_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-
-int bam_tview_main(int argc, char *argv[])
-{
- int view_mode=display_ncurses;
- tview_t* tv=NULL;
- char *samples=NULL, *position=NULL, *ref;
- int c;
-
- sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
- static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
- { NULL, 0, NULL, 0 }
- };
-
- while ((c = getopt_long(argc, argv, "s:p:d:", lopts, NULL)) >= 0) {
- switch (c) {
- case 's': samples=optarg; break;
- case 'p': position=optarg; break;
- case 'd':
- {
- switch(optarg[0])
- {
- case 'H': case 'h': view_mode=display_html;break;
- case 'T': case 't': view_mode=display_text;break;
- case 'C': case 'c': view_mode=display_ncurses;break;
- default: view_mode=display_ncurses;break;
- }
- break;
- }
- default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
- /* else fall-through */
- case '?': error(NULL);
- }
- }
- if (argc==optind) error(NULL);
-
- ref = (optind+1>=argc)? ga.reference : argv[optind+1];
-
- switch(view_mode)
- {
- case display_ncurses:
- tv = curses_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_text:
- tv = text_tv_init(argv[optind], ref, samples, &ga.in);
- break;
-
- case display_html:
- tv = html_tv_init(argv[optind], ref, samples, &ga.in);
- break;
- }
- if (tv==NULL)
- {
- error("cannot create view");
- return EXIT_FAILURE;
- }
-
- if ( position )
- {
- int tid, beg, end;
- char *name_lim = (char *) hts_parse_reg(position, &beg, &end);
- if (name_lim) *name_lim = '\0';
- else beg = 0; // region parsing failed, but possibly a seq named "foo:a"
- tid = bam_name2id(tv->header, position);
- if (tid >= 0) { tv->curr_tid = tid; tv->left_pos = beg; }
- }
- else if ( tv->fai )
- {
- // find the first sequence present in both BAM and the reference file
- int i;
- for (i=0; i<tv->header->n_targets; i++)
- {
- if ( faidx_has_seq(tv->fai, tv->header->target_name[i]) ) break;
- }
- if ( i==tv->header->n_targets )
- {
- fprintf(pysam_stderr,"None of the BAM sequence names present in the fasta file\n");
- exit(EXIT_FAILURE);
- }
- tv->curr_tid = i;
- }
- tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- tv->my_loop(tv);
- tv->my_destroy(tv);
-
- return EXIT_SUCCESS;
-}
+++ /dev/null
-/* bam_tview.h -- tview subcommand.
-
- Copyright (C) 2008, 2013 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#ifndef BAM_TVIEW_H
-#define BAM_TVIEW_H
-
-#include <ctype.h>
-#include <string.h>
-#include <math.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <htslib/sam.h>
-#include "bam2bcf.h"
-#include <htslib/khash.h>
-#include <htslib/hts.h>
-#include <htslib/faidx.h>
-#include "bam_lpileup.h"
-
-
-KHASH_MAP_INIT_STR(kh_rg, const char *)
-
-/* Holds state of Tview */
-typedef struct AbstractTview {
- int mrow, mcol;
-
- hts_idx_t* idx;
- bam_lplbuf_t* lplbuf;
- bam_hdr_t* header;
- samFile* fp;
- int curr_tid, left_pos;
- faidx_t* fai;
- bcf_callaux_t* bca;
-
- int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins;
- int no_skip, show_name, inverse;
- char *ref;
- /* maps @RG ID => SM (sample), in practice only used to determine whether a particular RG is in the list of allowed ones */
- khash_t(kh_rg) *rg_hash;
- /* callbacks */
- void (*my_destroy)(struct AbstractTview* );
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_mvaddch)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
- int (*my_drawaln)(struct AbstractTview*,int,int);
- int (*my_loop)(struct AbstractTview*);
- int (*my_underline)(struct AbstractTview*);
-} tview_t;
-
-
-char bam_aux_getCEi(bam1_t *b, int i);
-char bam_aux_getCSi(bam1_t *b, int i);
-char bam_aux_getCQi(bam1_t *b, int i);
-
-#define TV_MIN_ALNROW 2
-#define TV_MAX_GOTO 40
-#define TV_LOW_MAPQ 10
-
-#define TV_COLOR_MAPQ 0
-#define TV_COLOR_BASEQ 1
-#define TV_COLOR_NUCL 2
-#define TV_COLOR_COL 3
-#define TV_COLOR_COLQ 4
-
-#define TV_BASE_NUCL 0
-#define TV_BASE_COLOR_SPACE 1
-
-int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
-int base_tv_init(tview_t*,const char *fn, const char *fn_fa,
- const char *samples, const htsFormat *fmt);
-void base_tv_destroy(tview_t*);
-int base_draw_aln(tview_t *tv, int tid, int pos);
-
-typedef struct Tixel
- {
- int ch;
- int attributes;
- }tixel_t;
-
-#endif
-
+++ /dev/null
-/* bam_tview_curses.c -- curses tview implementation.
-
- Copyright (C) 2008-2013 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include "bam_tview.h"
-
-#ifdef HAVE_CURSES
-
-#if defined HAVE_NCURSESW_CURSES_H
-#include <ncursesw/curses.h>
-#elif defined HAVE_NCURSESW_H
-#include <ncursesw.h>
-#elif defined HAVE_NCURSES_CURSES_H
-#include <ncurses/curses.h>
-#elif defined HAVE_NCURSES_H
-#include <ncurses.h>
-#elif defined HAVE_CURSES_H
-#include <curses.h>
-#endif
-
-typedef struct CursesTview {
- tview_t view;
- WINDOW *wgoto, *whelp;
- } curses_tview_t;
-
-#define FROM_TV(ptr) ((curses_tview_t*)ptr)
-
-static void curses_destroy(tview_t* base)
- {
- curses_tview_t* tv=(curses_tview_t*)base;
-
-
- delwin(tv->wgoto); delwin(tv->whelp);
- endwin();
-
- base_tv_destroy(base);
-
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
- mvprintw(y,x,str);
- free(str);
- }
-
-static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- mvaddch(y,x,ch);
- }
-
-static void curses_attron(struct AbstractTview* tv,int flag)
- {
- attron(flag);
- }
-static void curses_attroff(struct AbstractTview* tv,int flag)
- {
- attroff(flag);
- }
-static void curses_clear(struct AbstractTview* tv)
- {
- clear();
- }
-
-static int curses_init_colors(int inverse)
-{
- if (inverse) {
- init_pair(1, COLOR_WHITE, COLOR_BLUE);
- init_pair(2, COLOR_BLACK, COLOR_GREEN);
- init_pair(3, COLOR_BLACK, COLOR_YELLOW);
- init_pair(4, COLOR_BLACK, COLOR_WHITE);
- init_pair(5, COLOR_BLACK, COLOR_GREEN);
- init_pair(6, COLOR_BLACK, COLOR_CYAN);
- init_pair(7, COLOR_WHITE, COLOR_MAGENTA);
- init_pair(8, COLOR_WHITE, COLOR_RED);
- init_pair(9, COLOR_WHITE, COLOR_BLUE);
- } else {
- init_pair(1, COLOR_BLUE, COLOR_BLACK);
- init_pair(2, COLOR_GREEN, COLOR_BLACK);
- init_pair(3, COLOR_YELLOW, COLOR_BLACK);
- init_pair(4, COLOR_WHITE, COLOR_BLACK);
- init_pair(5, COLOR_GREEN, COLOR_BLACK);
- init_pair(6, COLOR_CYAN, COLOR_BLACK);
- init_pair(7, COLOR_MAGENTA, COLOR_BLACK);
- init_pair(8, COLOR_RED, COLOR_BLACK);
- init_pair(9, COLOR_BLUE, COLOR_BLACK);
- }
-
- return 0;
-}
-
-static int curses_colorpair(struct AbstractTview* tv,int flag)
- {
- return COLOR_PAIR(flag);
- }
-
-static int curses_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- return base_draw_aln(tv, tid, pos);
- }
-
-
-
-static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos)
- {
- char str[256], *p;
- int i, l = 0;
- tview_t *base=(tview_t*)tv;
- wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(tv->wgoto, 1, 2, "Goto: ");
- for (;;) {
- int invalid = 0;
- int c = wgetch(tv->wgoto);
- wrefresh(tv->wgoto);
- if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
- if(l > 0) --l;
- } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
- int _tid = -1, _beg, _end;
- if (str[0] == '=') {
- _beg = strtol(str+1, &p, 10) - 1;
- if (_beg > 0) {
- *pos = _beg;
- return;
- }
- } else {
- char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end);
- if (name_lim) {
- char name_terminator = *name_lim;
- *name_lim = '\0';
- _tid = bam_name2id(base->header, str);
- *name_lim = name_terminator;
- }
- else {
- // Unparsable region, but possibly a sequence named "foo:a"
- _tid = bam_name2id(base->header, str);
- _beg = 0;
- }
-
- if (_tid >= 0) {
- *tid = _tid; *pos = _beg;
- return;
- }
- }
-
- // If we get here, the region string is invalid
- invalid = 1;
- } else if (isgraph(c)) {
- if (l < TV_MAX_GOTO) str[l++] = c;
- } else if (c == '\027') l = 0;
- else if (c == '\033') return;
- str[l] = '\0';
- for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
- if (invalid) mvwprintw(tv->wgoto, 1, TV_MAX_GOTO - 1, "[Invalid]");
- mvwprintw(tv->wgoto, 1, 8, "%s", str);
- }
-}
-
-
-
-
-static void tv_win_help(curses_tview_t *tv) {
- int r = 1;
- tview_t* base=(tview_t*)base;
- WINDOW *win = tv->whelp;
- wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(win, r++, 2, " -=- Help -=- ");
- r++;
- mvwprintw(win, r++, 2, "? This window");
- mvwprintw(win, r++, 2, "Arrows Small scroll movement");
- mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement");
- mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement");
- mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left");
- mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right");
- mvwprintw(win, r++, 2, "space Scroll one screen");
- mvwprintw(win, r++, 2, "backspace Scroll back one screen");
- mvwprintw(win, r++, 2, "g Go to specific location");
- mvwprintw(win, r++, 2, "m Color for mapping qual");
- mvwprintw(win, r++, 2, "n Color for nucleotide");
- mvwprintw(win, r++, 2, "b Color for base quality");
- mvwprintw(win, r++, 2, "c Color for cs color");
- mvwprintw(win, r++, 2, "z Color for cs qual");
- mvwprintw(win, r++, 2, ". Toggle on/off dot view");
- mvwprintw(win, r++, 2, "s Toggle on/off ref skip");
- mvwprintw(win, r++, 2, "r Toggle on/off rd name");
- mvwprintw(win, r++, 2, "N Turn on nt view");
- mvwprintw(win, r++, 2, "C Turn on cs view");
- mvwprintw(win, r++, 2, "i Toggle on/off ins");
- mvwprintw(win, r++, 2, "v Inverse video");
- mvwprintw(win, r++, 2, "q Exit");
- r++;
- mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
- mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19");
- mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30");
- wrefresh(win);
- wgetch(win);
-}
-
-static int curses_underline(tview_t* tv)
- {
- return A_UNDERLINE;
- }
-
-static int curses_loop(tview_t* tv)
- {
- int tid, pos;
- curses_tview_t *CTV=(curses_tview_t *)tv;
- tid = tv->curr_tid; pos = tv->left_pos;
- while (1) {
- int c = getch();
- switch (c) {
- case '?': tv_win_help(CTV); break;
- case '\033':
- case 'q': goto end_loop;
- case '/':
- case 'g': tv_win_goto(CTV, &tid, &pos); break;
- case 'm': tv->color_for = TV_COLOR_MAPQ; break;
- case 'b': tv->color_for = TV_COLOR_BASEQ; break;
- case 'n': tv->color_for = TV_COLOR_NUCL; break;
- case 'c': tv->color_for = TV_COLOR_COL; break;
- case 'z': tv->color_for = TV_COLOR_COLQ; break;
- case 'v': curses_init_colors(tv->inverse = !tv->inverse); break;
- case 's': tv->no_skip = !tv->no_skip; break;
- case 'r': tv->show_name = !tv->show_name; break;
- case KEY_LEFT:
- case 'h': --pos; break;
- case KEY_RIGHT:
- case 'l': ++pos; break;
- case KEY_SLEFT:
- case 'H': pos -= 20; break;
- case KEY_SRIGHT:
- case 'L': pos += 20; break;
- case '.': tv->is_dot = !tv->is_dot; break;
- case 'N': tv->base_for = TV_BASE_NUCL; break;
- case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
- case 'i': tv->ins = !tv->ins; break;
- case '\010': pos -= 1000; break;
- case '\014': pos += 1000; break;
- case ' ': pos += tv->mcol; break;
- case KEY_UP:
- case 'j': --tv->row_shift; break;
- case KEY_DOWN:
- case 'k': ++tv->row_shift; break;
- case KEY_BACKSPACE:
- case '\177': pos -= tv->mcol; break;
- case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
- default: continue;
- }
- if (pos < 0) pos = 0;
- if (tv->row_shift < 0) tv->row_shift = 0;
- tv->my_drawaln(tv, tid, pos);
- }
-end_loop:
- return 0;
-}
-
-
-
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(stderr,"Calloc failed\n");
- return 0;
- }
-
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
- initscr();
- keypad(stdscr, TRUE);
- clear();
- noecho();
- cbreak();
-
- getmaxyx(stdscr, base->mrow, base->mcol);
- tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(30, 40, 5, 5);
-
- start_color();
- curses_init_colors(0);
- return base;
- }
-
-#else // !HAVE_CURSES
-
-#warning "No curses library is available; tview with curses is disabled."
-
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt);
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- return text_tv_init(fn,fn_fa,samples,fmt);
- }
-
-#endif
+++ /dev/null
-#include "pysam.h"
-
-/* bam_tview_curses.c -- curses tview implementation.
-
- Copyright (C) 2008-2013 Genome Research Ltd.
- Portions copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notices and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include "bam_tview.h"
-
-#ifdef HAVE_CURSES
-
-#if defined HAVE_NCURSESW_CURSES_H
-#include <ncursesw/curses.h>
-#elif defined HAVE_NCURSESW_H
-#include <ncursesw.h>
-#elif defined HAVE_NCURSES_CURSES_H
-#include <ncurses/curses.h>
-#elif defined HAVE_NCURSES_H
-#include <ncurses.h>
-#elif defined HAVE_CURSES_H
-#include <curses.h>
-#endif
-
-typedef struct CursesTview {
- tview_t view;
- WINDOW *wgoto, *whelp;
- } curses_tview_t;
-
-#define FROM_TV(ptr) ((curses_tview_t*)ptr)
-
-static void curses_destroy(tview_t* base)
- {
- curses_tview_t* tv=(curses_tview_t*)base;
-
-
- delwin(tv->wgoto); delwin(tv->whelp);
- endwin();
-
- base_tv_destroy(base);
-
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
- mvprintw(y,x,str);
- free(str);
- }
-
-static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- mvaddch(y,x,ch);
- }
-
-static void curses_attron(struct AbstractTview* tv,int flag)
- {
- attron(flag);
- }
-static void curses_attroff(struct AbstractTview* tv,int flag)
- {
- attroff(flag);
- }
-static void curses_clear(struct AbstractTview* tv)
- {
- clear();
- }
-
-static int curses_init_colors(int inverse)
-{
- if (inverse) {
- init_pair(1, COLOR_WHITE, COLOR_BLUE);
- init_pair(2, COLOR_BLACK, COLOR_GREEN);
- init_pair(3, COLOR_BLACK, COLOR_YELLOW);
- init_pair(4, COLOR_BLACK, COLOR_WHITE);
- init_pair(5, COLOR_BLACK, COLOR_GREEN);
- init_pair(6, COLOR_BLACK, COLOR_CYAN);
- init_pair(7, COLOR_WHITE, COLOR_MAGENTA);
- init_pair(8, COLOR_WHITE, COLOR_RED);
- init_pair(9, COLOR_WHITE, COLOR_BLUE);
- } else {
- init_pair(1, COLOR_BLUE, COLOR_BLACK);
- init_pair(2, COLOR_GREEN, COLOR_BLACK);
- init_pair(3, COLOR_YELLOW, COLOR_BLACK);
- init_pair(4, COLOR_WHITE, COLOR_BLACK);
- init_pair(5, COLOR_GREEN, COLOR_BLACK);
- init_pair(6, COLOR_CYAN, COLOR_BLACK);
- init_pair(7, COLOR_MAGENTA, COLOR_BLACK);
- init_pair(8, COLOR_RED, COLOR_BLACK);
- init_pair(9, COLOR_BLUE, COLOR_BLACK);
- }
-
- return 0;
-}
-
-static int curses_colorpair(struct AbstractTview* tv,int flag)
- {
- return COLOR_PAIR(flag);
- }
-
-static int curses_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- return base_draw_aln(tv, tid, pos);
- }
-
-
-
-static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos)
- {
- char str[256], *p;
- int i, l = 0;
- tview_t *base=(tview_t*)tv;
- wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(tv->wgoto, 1, 2, "Goto: ");
- for (;;) {
- int invalid = 0;
- int c = wgetch(tv->wgoto);
- wrefresh(tv->wgoto);
- if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
- if(l > 0) --l;
- } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
- int _tid = -1, _beg, _end;
- if (str[0] == '=') {
- _beg = strtol(str+1, &p, 10) - 1;
- if (_beg > 0) {
- *pos = _beg;
- return;
- }
- } else {
- char *name_lim = (char *) hts_parse_reg(str, &_beg, &_end);
- if (name_lim) {
- char name_terminator = *name_lim;
- *name_lim = '\0';
- _tid = bam_name2id(base->header, str);
- *name_lim = name_terminator;
- }
- else {
- // Unparsable region, but possibly a sequence named "foo:a"
- _tid = bam_name2id(base->header, str);
- _beg = 0;
- }
-
- if (_tid >= 0) {
- *tid = _tid; *pos = _beg;
- return;
- }
- }
-
- // If we get here, the region string is invalid
- invalid = 1;
- } else if (isgraph(c)) {
- if (l < TV_MAX_GOTO) str[l++] = c;
- } else if (c == '\027') l = 0;
- else if (c == '\033') return;
- str[l] = '\0';
- for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
- if (invalid) mvwprintw(tv->wgoto, 1, TV_MAX_GOTO - 1, "[Invalid]");
- mvwprintw(tv->wgoto, 1, 8, "%s", str);
- }
-}
-
-
-
-
-static void tv_win_help(curses_tview_t *tv) {
- int r = 1;
- tview_t* base=(tview_t*)base;
- WINDOW *win = tv->whelp;
- wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
- mvwprintw(win, r++, 2, " -=- Help -=- ");
- r++;
- mvwprintw(win, r++, 2, "? This window");
- mvwprintw(win, r++, 2, "Arrows Small scroll movement");
- mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement");
- mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement");
- mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left");
- mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right");
- mvwprintw(win, r++, 2, "space Scroll one screen");
- mvwprintw(win, r++, 2, "backspace Scroll back one screen");
- mvwprintw(win, r++, 2, "g Go to specific location");
- mvwprintw(win, r++, 2, "m Color for mapping qual");
- mvwprintw(win, r++, 2, "n Color for nucleotide");
- mvwprintw(win, r++, 2, "b Color for base quality");
- mvwprintw(win, r++, 2, "c Color for cs color");
- mvwprintw(win, r++, 2, "z Color for cs qual");
- mvwprintw(win, r++, 2, ". Toggle on/off dot view");
- mvwprintw(win, r++, 2, "s Toggle on/off ref skip");
- mvwprintw(win, r++, 2, "r Toggle on/off rd name");
- mvwprintw(win, r++, 2, "N Turn on nt view");
- mvwprintw(win, r++, 2, "C Turn on cs view");
- mvwprintw(win, r++, 2, "i Toggle on/off ins");
- mvwprintw(win, r++, 2, "v Inverse video");
- mvwprintw(win, r++, 2, "q Exit");
- r++;
- mvwprintw(win, r++, 2, "Underline: Secondary or orphan");
- mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19");
- mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30");
- wrefresh(win);
- wgetch(win);
-}
-
-static int curses_underline(tview_t* tv)
- {
- return A_UNDERLINE;
- }
-
-static int curses_loop(tview_t* tv)
- {
- int tid, pos;
- curses_tview_t *CTV=(curses_tview_t *)tv;
- tid = tv->curr_tid; pos = tv->left_pos;
- while (1) {
- int c = getch();
- switch (c) {
- case '?': tv_win_help(CTV); break;
- case '\033':
- case 'q': goto end_loop;
- case '/':
- case 'g': tv_win_goto(CTV, &tid, &pos); break;
- case 'm': tv->color_for = TV_COLOR_MAPQ; break;
- case 'b': tv->color_for = TV_COLOR_BASEQ; break;
- case 'n': tv->color_for = TV_COLOR_NUCL; break;
- case 'c': tv->color_for = TV_COLOR_COL; break;
- case 'z': tv->color_for = TV_COLOR_COLQ; break;
- case 'v': curses_init_colors(tv->inverse = !tv->inverse); break;
- case 's': tv->no_skip = !tv->no_skip; break;
- case 'r': tv->show_name = !tv->show_name; break;
- case KEY_LEFT:
- case 'h': --pos; break;
- case KEY_RIGHT:
- case 'l': ++pos; break;
- case KEY_SLEFT:
- case 'H': pos -= 20; break;
- case KEY_SRIGHT:
- case 'L': pos += 20; break;
- case '.': tv->is_dot = !tv->is_dot; break;
- case 'N': tv->base_for = TV_BASE_NUCL; break;
- case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break;
- case 'i': tv->ins = !tv->ins; break;
- case '\010': pos -= 1000; break;
- case '\014': pos += 1000; break;
- case ' ': pos += tv->mcol; break;
- case KEY_UP:
- case 'j': --tv->row_shift; break;
- case KEY_DOWN:
- case 'k': ++tv->row_shift; break;
- case KEY_BACKSPACE:
- case '\177': pos -= tv->mcol; break;
- case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break;
- default: continue;
- }
- if (pos < 0) pos = 0;
- if (tv->row_shift < 0) tv->row_shift = 0;
- tv->my_drawaln(tv, tid, pos);
- }
-end_loop:
- return 0;
-}
-
-
-
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(pysam_stderr,"Calloc failed\n");
- return 0;
- }
-
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=curses_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
- initscr();
- keypad(stdscr, TRUE);
- clear();
- noecho();
- cbreak();
-
- getmaxyx(stdscr, base->mrow, base->mcol);
- tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
- tv->whelp = newwin(30, 40, 5, 5);
-
- start_color();
- curses_init_colors(0);
- return base;
- }
-
-#else // !HAVE_CURSES
-
-#warning "No curses library is available; tview with curses is disabled."
-
-extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt);
-
-tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- return text_tv_init(fn,fn_fa,samples,fmt);
- }
-
-#endif
+++ /dev/null
-/* bam_tview_html.c -- HTML tview output.
-
- Copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Pierre Lindenbaum <plindenbaum@yahoo.fr>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <unistd.h>
-#include "bam_tview.h"
-
-#define UNDERLINE_FLAG 10
-
-typedef struct HtmlTview {
- tview_t view;
- int row_count;
- tixel_t** screen;
- FILE* out;
- int attributes;/* color... */
- } html_tview_t;
-
-#define FROM_TV(ptr) ((html_tview_t*)ptr)
-
-static void html_destroy(tview_t* base)
- {
- int i;
- html_tview_t* tv=(html_tview_t*)base;
- if(tv->screen!=NULL)
- {
- for(i=0;i< tv->row_count;++i) free(tv->screen[i]);
- free(tv->screen);
- }
- base_tv_destroy(base);
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- int i,nchars=0;
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- nchars=vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
-
- for(i=0;i< nchars;++i)
- {
- tv->my_mvaddch(tv,y,x+i,str[i]);
- }
- free(str);
- }
-
-static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- tixel_t* row=NULL;
- html_tview_t* ptr=FROM_TV(tv);
- if( x >= tv->mcol ) return; //out of screen
- while(ptr->row_count<=y)
- {
- int x;
- row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t));
- if(row==0) exit(EXIT_FAILURE);
- for(x=0;x<tv->mcol;++x) {row[x].ch=' ';row[x].attributes=0;}
- ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1));
- ptr->screen[ptr->row_count++]=row;
- }
- row=ptr->screen[y];
- row[x].ch=ch;
- row[x].attributes=ptr->attributes;
- }
-
-static void html_attron(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes |= flag;
-
-
- }
-
-static void html_attroff(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes &= ~(flag);
- }
-
-static void html_clear(struct AbstractTview* tv)
- {
- html_tview_t* ptr=FROM_TV(tv);
- if(ptr->screen!=NULL)
- {
- int i;
- for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]);
- free(ptr->screen);
- ptr->screen=NULL;
- }
- ptr->row_count=0;
- ptr->attributes=0;
- }
-
-static int html_colorpair(struct AbstractTview* tv,int flag)
- {
- return (1 << (flag));
- }
-
-static int html_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- fputs("<html><head>",ptr->out);
- fprintf(ptr->out,"<title>%s:%d</title>",
- tv->header->target_name[tid],
- pos+1
- );
- //style
-
- fputs("<style type='text/css'>\n",ptr->out);
- fputs(".tviewbody { margin:5px; background-color:white;text-align:center;}\n",ptr->out);
- fputs(".tviewtitle {text-align:center;}\n",ptr->out);
- fputs(".tviewpre { margin:5px; background-color:white;}\n",ptr->out);
- #define CSS(id,col) fprintf(ptr->out,".tviewc%d {color:%s;}\n.tviewcu%d {color:%s;text-decoration:underline;}\n",id,col,id,col);
- CSS(0, "black");
- CSS(1, "blue");
- CSS(2, "green");
- CSS(3, "yellow");
- CSS(4, "black");
- CSS(5, "green");
- CSS(6, "cyan");
- CSS(7, "yellow");
- CSS(8, "red");
- CSS(9, "blue");
- #undef CSS
- fputs("</style>",ptr->out);
-
- fputs("</head><body>",ptr->out);
-
- fprintf(ptr->out,"<div class='tviewbody'><div class='tviewtitle'>%s:%d</div>",
- tv->header->target_name[tid],
- pos+1
- );
-
- fputs("<pre class='tviewpre'>",ptr->out);
- for(y=0;y< ptr->row_count;++y)
- {
-
- for(x=0;x< tv->mcol;++x)
- {
-
-
- if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes)
- {
- int css=0;
- fprintf(ptr->out,"<span");
- while(css<32)
- {
- //if(y>1) fprintf(stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
-
- fprintf(ptr->out," class='tviewc%s%d'",
- (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""),
- css);
- break;
- }
- ++css;
- }
-
-
- fputs(">",ptr->out);
- }
-
- int ch=ptr->screen[y][x].ch;
- switch(ch)
- {
- case '<': fputs("<",ptr->out);break;
- case '>': fputs(">",ptr->out);break;
- case '&': fputs("&",ptr->out);break;
- default: fputc(ch,ptr->out); break;
- }
-
-
- if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes)
- {
- fputs("</span>",ptr->out);
- }
- }
- if(y+1 < ptr->row_count) fputs("<br/>",ptr->out);
- }
- fputs("</pre></div></body></html>",ptr->out);
- return 0;
- }
-
-
-#define ANSI_COLOR_RED "\x1b[31m"
-#define ANSI_COLOR_GREEN "\x1b[32m"
-#define ANSI_COLOR_YELLOW "\x1b[33m"
-#define ANSI_COLOR_BLUE "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN "\x1b[36m"
-#define ANSI_COLOR_BLACK "\x1b[0m"
-#define ANSI_COLOR_RESET ANSI_COLOR_BLACK
-
-#define ANSI_UNDERLINE_SET "\033[4m"
-#define ANSI_UNDERLINE_UNSET "\033[0m"
-
-static int text_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- int is_term= isatty(fileno(ptr->out));
-
- for(y=0;y< ptr->row_count;++y)
- {
- for(x=0;x< tv->mcol;++x)
- {
- if(is_term)
- {
- int css=0;
- while(css<32)
- {
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
- break;
- }
- ++css;
- }
- switch(css)
- {
- //CSS(0, "black");
- case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- //CSS(4, "black");
- case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break;
- case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- case 8: fputs(ANSI_COLOR_RED,ptr->out); break;
- case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- default:break;
- }
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_SET,ptr->out);
- }
-
- }
-
-
- int ch=ptr->screen[y][x].ch;
-
- fputc(ch,ptr->out);
- if(is_term)
- {
- fputs(ANSI_COLOR_RESET,ptr->out);
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_UNSET,ptr->out);
- }
- }
- }
- fputc('\n',ptr->out);
- }
- return 0;
- }
-
-
-static int html_loop(tview_t* tv)
- {
- //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- return 0;
- }
-
-static int html_underline(tview_t* tv)
- {
- return (1 << UNDERLINE_FLAG);
- }
-
-/*
-static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper)
- {
-
- }
-*/
-
-tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- char* colstr=getenv("COLUMNS");
- html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(stderr,"Calloc failed\n");
- return 0;
- }
- tv->row_count=0;
- tv->screen=NULL;
- tv->out=stdout;
- tv->attributes=0;
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
-
- if(colstr!=0)
- {
- base->mcol=atoi(colstr);
- if(base->mcol<10) base->mcol=80;
- }
- base->mrow=99999;
-
-/*
- init_pair(tv,1, "blue", "white");
- init_pair(tv,2, "green", "white");
- init_pair(tv,3, "yellow", "white");
- init_pair(tv,4, "white", "white");
- init_pair(tv,5, "green", "white");
- init_pair(tv,6, "cyan", "white");
- init_pair(tv,7, "yellow", "white");
- init_pair(tv,8, "red", "white");
- init_pair(tv,9, "blue", "white");
- */
- return base;
- }
-
-
-tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt);
- tv->my_drawaln=text_drawaln;
- return tv;
- }
-
+++ /dev/null
-#include "pysam.h"
-
-/* bam_tview_html.c -- HTML tview output.
-
- Copyright (C) 2013 Pierre Lindenbaum, Institut du Thorax, INSERM U1087, Université de Nantes.
-
- Author: Pierre Lindenbaum <plindenbaum@yahoo.fr>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <unistd.h>
-#include "bam_tview.h"
-
-#define UNDERLINE_FLAG 10
-
-typedef struct HtmlTview {
- tview_t view;
- int row_count;
- tixel_t** screen;
- FILE* out;
- int attributes;/* color... */
- } html_tview_t;
-
-#define FROM_TV(ptr) ((html_tview_t*)ptr)
-
-static void html_destroy(tview_t* base)
- {
- int i;
- html_tview_t* tv=(html_tview_t*)base;
- if(tv->screen!=NULL)
- {
- for(i=0;i< tv->row_count;++i) free(tv->screen[i]);
- free(tv->screen);
- }
- base_tv_destroy(base);
- free(tv);
- }
-
-/*
- void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...);
- void (*my_)(struct AbstractTview*,int,int,int);
- void (*my_attron)(struct AbstractTview*,int);
- void (*my_attroff)(struct AbstractTview*,int);
- void (*my_clear)(struct AbstractTview*);
- int (*my_colorpair)(struct AbstractTview*,int);
-*/
-
-static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...)
- {
- int i,nchars=0;
- unsigned int size=tv->mcol+2;
- char* str=malloc(size);
- if(str==0) exit(EXIT_FAILURE);
- va_list argptr;
- va_start(argptr, fmt);
- nchars=vsnprintf(str,size, fmt, argptr);
- va_end(argptr);
-
- for(i=0;i< nchars;++i)
- {
- tv->my_mvaddch(tv,y,x+i,str[i]);
- }
- free(str);
- }
-
-static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch)
- {
- tixel_t* row=NULL;
- html_tview_t* ptr=FROM_TV(tv);
- if( x >= tv->mcol ) return; //out of screen
- while(ptr->row_count<=y)
- {
- int x;
- row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t));
- if(row==0) exit(EXIT_FAILURE);
- for(x=0;x<tv->mcol;++x) {row[x].ch=' ';row[x].attributes=0;}
- ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1));
- ptr->screen[ptr->row_count++]=row;
- }
- row=ptr->screen[y];
- row[x].ch=ch;
- row[x].attributes=ptr->attributes;
- }
-
-static void html_attron(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes |= flag;
-
-
- }
-
-static void html_attroff(struct AbstractTview* tv,int flag)
- {
- html_tview_t* ptr=FROM_TV(tv);
- ptr->attributes &= ~(flag);
- }
-
-static void html_clear(struct AbstractTview* tv)
- {
- html_tview_t* ptr=FROM_TV(tv);
- if(ptr->screen!=NULL)
- {
- int i;
- for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]);
- free(ptr->screen);
- ptr->screen=NULL;
- }
- ptr->row_count=0;
- ptr->attributes=0;
- }
-
-static int html_colorpair(struct AbstractTview* tv,int flag)
- {
- return (1 << (flag));
- }
-
-static int html_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- fputs("<html><head>",ptr->out);
- fprintf(ptr->out,"<title>%s:%d</title>",
- tv->header->target_name[tid],
- pos+1
- );
- //style
-
- fputs("<style type='text/css'>\n",ptr->out);
- fputs(".tviewbody { margin:5px; background-color:white;text-align:center;}\n",ptr->out);
- fputs(".tviewtitle {text-align:center;}\n",ptr->out);
- fputs(".tviewpre { margin:5px; background-color:white;}\n",ptr->out);
- #define CSS(id,col) fprintf(ptr->out,".tviewc%d {color:%s;}\n.tviewcu%d {color:%s;text-decoration:underline;}\n",id,col,id,col);
- CSS(0, "black");
- CSS(1, "blue");
- CSS(2, "green");
- CSS(3, "yellow");
- CSS(4, "black");
- CSS(5, "green");
- CSS(6, "cyan");
- CSS(7, "yellow");
- CSS(8, "red");
- CSS(9, "blue");
- #undef CSS
- fputs("</style>",ptr->out);
-
- fputs("</head><body>",ptr->out);
-
- fprintf(ptr->out,"<div class='tviewbody'><div class='tviewtitle'>%s:%d</div>",
- tv->header->target_name[tid],
- pos+1
- );
-
- fputs("<pre class='tviewpre'>",ptr->out);
- for(y=0;y< ptr->row_count;++y)
- {
-
- for(x=0;x< tv->mcol;++x)
- {
-
-
- if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes)
- {
- int css=0;
- fprintf(ptr->out,"<span");
- while(css<32)
- {
- //if(y>1) fprintf(pysam_stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
-
- fprintf(ptr->out," class='tviewc%s%d'",
- (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""),
- css);
- break;
- }
- ++css;
- }
-
-
- fputs(">",ptr->out);
- }
-
- int ch=ptr->screen[y][x].ch;
- switch(ch)
- {
- case '<': fputs("<",ptr->out);break;
- case '>': fputs(">",ptr->out);break;
- case '&': fputs("&",ptr->out);break;
- default: fputc(ch,ptr->out); break;
- }
-
-
- if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes)
- {
- fputs("</span>",ptr->out);
- }
- }
- if(y+1 < ptr->row_count) fputs("<br/>",ptr->out);
- }
- fputs("</pre></div></body></html>",ptr->out);
- return 0;
- }
-
-
-#define ANSI_COLOR_RED "\x1b[31m"
-#define ANSI_COLOR_GREEN "\x1b[32m"
-#define ANSI_COLOR_YELLOW "\x1b[33m"
-#define ANSI_COLOR_BLUE "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN "\x1b[36m"
-#define ANSI_COLOR_BLACK "\x1b[0m"
-#define ANSI_COLOR_RESET ANSI_COLOR_BLACK
-
-#define ANSI_UNDERLINE_SET "\033[4m"
-#define ANSI_UNDERLINE_UNSET "\033[0m"
-
-static int text_drawaln(struct AbstractTview* tv, int tid, int pos)
- {
- int y,x;
- html_tview_t* ptr=FROM_TV(tv);
- html_clear(tv);
- base_draw_aln(tv, tid, pos);
- int is_term= isatty(fileno(ptr->out));
-
- for(y=0;y< ptr->row_count;++y)
- {
- for(x=0;x< tv->mcol;++x)
- {
- if(is_term)
- {
- int css=0;
- while(css<32)
- {
- if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
- {
- break;
- }
- ++css;
- }
- switch(css)
- {
- //CSS(0, "black");
- case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- //CSS(4, "black");
- case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break;
- case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break;
- case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break;
- case 8: fputs(ANSI_COLOR_RED,ptr->out); break;
- case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break;
- default:break;
- }
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_SET,ptr->out);
- }
-
- }
-
-
- int ch=ptr->screen[y][x].ch;
-
- fputc(ch,ptr->out);
- if(is_term)
- {
- fputs(ANSI_COLOR_RESET,ptr->out);
- if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0)
- {
- fputs(ANSI_UNDERLINE_UNSET,ptr->out);
- }
- }
- }
- fputc('\n',ptr->out);
- }
- return 0;
- }
-
-
-static int html_loop(tview_t* tv)
- {
- //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos);
- return 0;
- }
-
-static int html_underline(tview_t* tv)
- {
- return (1 << UNDERLINE_FLAG);
- }
-
-/*
-static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper)
- {
-
- }
-*/
-
-tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- char* colstr=getenv("COLUMNS");
- html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t));
- tview_t* base=(tview_t*)tv;
- if(tv==0)
- {
- fprintf(pysam_stderr,"Calloc failed\n");
- return 0;
- }
- tv->row_count=0;
- tv->screen=NULL;
- tv->out=pysam_stdout;
- tv->attributes=0;
- base_tv_init(base,fn,fn_fa,samples,fmt);
- /* initialize callbacks */
-#define SET_CALLBACK(fun) base->my_##fun=html_##fun;
- SET_CALLBACK(destroy);
- SET_CALLBACK(mvprintw);
- SET_CALLBACK(mvaddch);
- SET_CALLBACK(attron);
- SET_CALLBACK(attroff);
- SET_CALLBACK(clear);
- SET_CALLBACK(colorpair);
- SET_CALLBACK(drawaln);
- SET_CALLBACK(loop);
- SET_CALLBACK(underline);
-#undef SET_CALLBACK
-
-
- if(colstr!=0)
- {
- base->mcol=atoi(colstr);
- if(base->mcol<10) base->mcol=80;
- }
- base->mrow=99999;
-
-/*
- init_pair(tv,1, "blue", "white");
- init_pair(tv,2, "green", "white");
- init_pair(tv,3, "yellow", "white");
- init_pair(tv,4, "white", "white");
- init_pair(tv,5, "green", "white");
- init_pair(tv,6, "cyan", "white");
- init_pair(tv,7, "yellow", "white");
- init_pair(tv,8, "red", "white");
- init_pair(tv,9, "blue", "white");
- */
- return base;
- }
-
-
-tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples,
- const htsFormat *fmt)
- {
- tview_t* tv=html_tv_init(fn,fn_fa,samples,fmt);
- tv->my_drawaln=text_drawaln;
- return tv;
- }
-
#include "htslib/hts.h"
#include "htslib/ksort.h"
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#define DEF_CLEVEL 1
bam_hdr_t *h = NULL;
int64_t j, max_cnt = 0, *cnt = NULL;
elem_t *a = NULL;
+ htsThreadPool p = {NULL, 0};
+
+ if (ga->nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga->nthreads))) {
+ print_error_errno("collate", "Error creating thread pool\n");
+ return 1;
+ }
+ }
// Read input, distribute reads pseudo-randomly into n_files temporary
// files.
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
h = sam_hdr_read(fp);
if (h == NULL) {
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
goto fail;
}
+ if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p);
if (sam_hdr_write(fpw, h) < 0) {
print_error_errno("collate", "Couldn't write header");
print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]);
goto fail;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header
// Slurp in one of the split files
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
return 0;
mem_fail:
free(fnt);
free(fpt);
free(cnt);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(ga);
return 1;
}
static int usage(FILE *fp, int n_files) {
fprintf(fp,
- "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
+ "Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] <in.bam> <out.prefix>\n\n"
"Options:\n"
" -O output to stdout\n"
" -u uncompressed BAM output\n"
" -n INT number of temporary files [%d]\n", // n_files
DEF_CLEVEL, n_files);
- sam_global_opt_help(fp, "-....");
+ sam_global_opt_help(fp, "-....@");
return 1;
}
int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "n:l:uO@:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
#include "htslib/hts.h"
#include "htslib/ksort.h"
#include "samtools.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#define DEF_CLEVEL 1
bam_hdr_t *h = NULL;
int64_t j, max_cnt = 0, *cnt = NULL;
elem_t *a = NULL;
+ htsThreadPool p = {NULL, 0};
+
+ if (ga->nthreads > 0) {
+ if (!(p.pool = hts_tpool_init(ga->nthreads))) {
+ print_error_errno("collate", "Error creating thread pool\n");
+ return 1;
+ }
+ }
// Read input, distribute reads pseudo-randomly into n_files temporary
// files.
print_error_errno("collate", "Cannot open input file \"%s\"", fn);
return 1;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
h = sam_hdr_read(fp);
if (h == NULL) {
else print_error_errno("collate", "Cannot open output file \"%s.bam\"", pre);
goto fail;
}
+ if (p.pool) hts_set_opt(fpw, HTS_OPT_THREAD_POOL, &p);
if (sam_hdr_write(fpw, h) < 0) {
print_error_errno("collate", "Couldn't write header");
print_error_errno("collate", "Couldn't open \"%s\"", fnt[i]);
goto fail;
}
+ if (p.pool) hts_set_opt(fp, HTS_OPT_THREAD_POOL, &p);
bam_hdr_destroy(sam_hdr_read(fp)); // Skip over header
// Slurp in one of the split files
return 1;
}
+ if (p.pool) hts_tpool_destroy(p.pool);
return 0;
mem_fail:
free(fnt);
free(fpt);
free(cnt);
+ if (p.pool) hts_tpool_destroy(p.pool);
sam_global_args_free(ga);
return 1;
}
static int usage(FILE *fp, int n_files) {
fprintf(fp,
- "Usage: samtools collate [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"
+ "Usage: samtools collate [-Ou] [-n nFiles] [-l cLevel] <in.bam> <out.prefix>\n\n"
"Options:\n"
" -O output to pysam_stdout\n"
" -u uncompressed BAM output\n"
" -n INT number of temporary files [%d]\n", // n_files
DEF_CLEVEL, n_files);
- sam_global_opt_help(fp, "-....");
+ sam_global_opt_help(fp, "-....@");
return 1;
}
int c, n_files = 64, clevel = DEF_CLEVEL, is_pysam_stdout = 0, is_un = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '@'),
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "n:l:uO", lopts, NULL)) >= 0) {
+ while ((c = getopt_long(argc, argv, "n:l:uO@:", lopts, NULL)) >= 0) {
switch (c) {
case 'n': n_files = atoi(optarg); break;
case 'l': clevel = atoi(optarg); break;
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2016 Genome Research Ltd.
+ Copyright (C) 2008-2017 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
-#include <stdarg.h>
#include <string.h>
-#include <errno.h>
+
#include "htslib/hts.h"
#include "samtools.h"
#include "version.h"
return SAMTOOLS_VERSION;
}
-static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
-{
- fflush(stdout);
- if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand);
- else fprintf(stderr, "samtools: ");
- vfprintf(stderr, format, args);
- if (extra) fprintf(stderr, ": %s\n", extra);
- else fprintf(stderr, "\n");
- fflush(stderr);
-}
-
-void print_error(const char *subcommand, const char *format, ...)
-{
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, NULL);
- va_end(args);
-}
-
-void print_error_errno(const char *subcommand, const char *format, ...)
-{
- int err = errno;
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, strerror(err));
- va_end(args);
-}
-
static void usage(FILE *fp)
{
/* Please improve the grouping */
printf(
"samtools %s\n"
"Using htslib %s\n"
-"Copyright (C) 2016 Genome Research Ltd.\n",
+"Copyright (C) 2017 Genome Research Ltd.\n",
samtools_version(), hts_version());
}
else if (strcmp(argv[1], "--version-only") == 0) {
/* bamtk.c -- main samtools command front-end.
- Copyright (C) 2008-2016 Genome Research Ltd.
+ Copyright (C) 2008-2017 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
-#include <stdarg.h>
#include <string.h>
-#include <errno.h>
+
#include "htslib/hts.h"
#include "samtools.h"
#include "version.h"
int bam_merge(int argc, char *argv[]);
int bam_index(int argc, char *argv[]);
int bam_sort(int argc, char *argv[]);
-int bam_tview_main(int argc, char *argv[]);
+/* AH: int bam_tview_main(int argc, char *argv[]); */
int bam_mating(int argc, char *argv[]);
int bam_rmdup(int argc, char *argv[]);
int bam_flagstat(int argc, char *argv[]);
return SAMTOOLS_VERSION;
}
-static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
-{
- fflush(pysam_stdout);
- if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand);
- else fprintf(pysam_stderr, "samtools: ");
- vfprintf(pysam_stderr, format, args);
- if (extra) fprintf(pysam_stderr, ": %s\n", extra);
- else fprintf(pysam_stderr, "\n");
- fflush(pysam_stderr);
-}
-
-void print_error(const char *subcommand, const char *format, ...)
-{
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, NULL);
- va_end(args);
-}
-
-void print_error_errno(const char *subcommand, const char *format, ...)
-{
- int err = errno;
- va_list args;
- va_start(args, format);
- vprint_error_core(subcommand, format, args, strerror(err));
- va_end(args);
-}
-
static void usage(FILE *fp)
{
/* Please improve the grouping */
fprintf(pysam_stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n");
return 1;
}
- else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1);
+/* AH: else if (strcmp(argv[1], "tview") == 0) ret = bam_tview_main(argc-1, argv+1); */
else if (strcmp(argv[1], "--version") == 0) {
fprintf(pysam_stdout,
"samtools %s\n"
"Using htslib %s\n"
-"Copyright (C) 2016 Genome Research Ltd.\n",
+"Copyright (C) 2017 Genome Research Ltd.\n",
samtools_version(), hts_version());
}
else if (strcmp(argv[1], "--version-only") == 0) {
#include <unistd.h>
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kseq.h"
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
}
if (usage || optind + 2 > argc) {
fprintf(stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
- fprintf(stderr, " -Q INT Only count bases of at least INT quality [0]\n");
- sam_global_opt_help(stderr, "-.--.");
+ fprintf(stderr, "Options:\n");
+ fprintf(stderr, " -Q <int> mapping quality threshold [0]\n");
+ sam_global_opt_help(stderr, "-.--.-");
return 1;
}
memset(&str, 0, sizeof(kstring_t));
#include <unistd.h>
#include "htslib/kstring.h"
#include "htslib/sam.h"
+#include "htslib/thread_pool.h"
#include "sam_opts.h"
#include "htslib/kseq.h"
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '-'),
{ NULL, 0, NULL, 0 }
};
}
if (usage || optind + 2 > argc) {
fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
- fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n");
- sam_global_opt_help(pysam_stderr, "-.--.");
+ fprintf(pysam_stderr, "Options:\n");
+ fprintf(pysam_stderr, " -Q <int> mapping quality threshold [0]\n");
+ sam_global_opt_help(pysam_stderr, "-.--.-");
return 1;
}
memset(&str, 0, sizeof(kstring_t));
/* cut_target.c -- targetcut subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2012-2013, 2015 Genome Research Ltd.
+ Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
-#include "errmod.h"
#include "htslib/faidx.h"
+#include "samtools.h"
#include "sam_opts.h"
#define ERR_DEP 0.83
static int read_aln(void *data, bam1_t *b)
{
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
ct_t *g = (ct_t*)data;
int ret;
while (1)
g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len);
g->tid = b->core.tid;
}
- bam_prob_realn_core(b, g->ref, g->len, 1<<1|1);
+ sam_prob_realn(b, g->ref, g->len, 1<<1|1);
}
break;
}
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'),
{ NULL, 0, NULL, 0 }
};
}
if (usage || argc == optind) {
fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
- sam_global_opt_help(stderr, "-.--f");
+ sam_global_opt_help(stderr, "-.--f-");
return 1;
}
l = max_l = 0; cns = 0;
g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ if (g.fp == NULL) {
+ print_error_errno("targetcut", "can't open \"%s\"", argv[optind]);
+ return 1;
+ }
+
g.h = sam_hdr_read(g.fp);
if (g.h == NULL) {
- fprintf(stderr, "Couldn't read header for '%s'\n", argv[optind]);
+ print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]);
sam_close(g.fp);
return 1;
}
/* cut_target.c -- targetcut subcommand.
Copyright (C) 2011 Broad Institute.
- Copyright (C) 2012-2013, 2015 Genome Research Ltd.
+ Copyright (C) 2012-2013, 2015, 2016 Genome Research Ltd.
Author: Heng Li <lh3@sanger.ac.uk>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
-#include "errmod.h"
#include "htslib/faidx.h"
+#include "samtools.h"
#include "sam_opts.h"
#define ERR_DEP 0.83
static int read_aln(void *data, bam1_t *b)
{
- extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag);
ct_t *g = (ct_t*)data;
int ret;
while (1)
g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &g->len);
g->tid = b->core.tid;
}
- bam_prob_realn_core(b, g->ref, g->len, 1<<1|1);
+ sam_prob_realn(b, g->ref, g->len, 1<<1|1);
}
break;
}
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 'f', '-'),
{ NULL, 0, NULL, 0 }
};
}
if (usage || argc == optind) {
fprintf(pysam_stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] <in.bam>\n");
- sam_global_opt_help(pysam_stderr, "-.--f");
+ sam_global_opt_help(pysam_stderr, "-.--f-");
return 1;
}
l = max_l = 0; cns = 0;
g.fp = sam_open_format(argv[optind], "r", &ga.in);
+ if (g.fp == NULL) {
+ print_error_errno("targetcut", "can't open \"%s\"", argv[optind]);
+ return 1;
+ }
+
g.h = sam_hdr_read(g.fp);
if (g.h == NULL) {
- fprintf(pysam_stderr, "Couldn't read header for '%s'\n", argv[optind]);
+ print_error("targetcut", "couldn't read header for \"%s\"", argv[optind]);
sam_close(g.fp);
return 1;
}
+++ /dev/null
-/* errmod.c -- revised MAQ error model.
-
- Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <math.h>
-#include "errmod.h"
-#include "htslib/ksort.h"
-KSORT_INIT_GENERIC(uint16_t)
-
-/* table of constants generated for given depcorr and eta */
-typedef struct __errmod_coef_t {
- double *fk, *beta, *lhet;
-} errmod_coef_t;
-
-typedef struct {
- double fsum[16], bsum[16];
- uint32_t c[16];
-} call_aux_t;
-
-/* \Gamma(n) = (n-1)! */
-#define lfact(n) lgamma(n+1)
-
-/* generates a success * trials table of bionomial probability densities (log transformed) */
-static double* logbinomial_table( const int n_size )
-{
- /* prob distribution for binom var is p(k) = {n! \over k! (n-k)! } p^k (1-p)^{n-k} */
- /* this calcs p(k) = {log(n!) - log(k!) - log((n-k)!) */
- int k, n;
- double *logbinom = (double*)calloc(n_size * n_size, sizeof(double));
- for (n = 1; n < n_size; ++n) {
- double lfn = lfact(n);
- for (k = 1; k <= n; ++k)
- logbinom[n<<8|k] = lfn - lfact(k) - lfact(n-k);
- }
- return logbinom;
-}
-
-static errmod_coef_t *cal_coef(double depcorr, double eta)
-{
- int k, n, q;
- long double sum, sum1;
- double *lC;
- errmod_coef_t *ec;
-
- ec = calloc(1, sizeof(errmod_coef_t));
- // initialize ->fk
- ec->fk = (double*)calloc(256, sizeof(double));
- ec->fk[0] = 1.0;
- for (n = 1; n < 256; ++n)
- ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
- // initialize ->coef
- ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
-
- lC = logbinomial_table( 256 );
-
- for (q = 1; q < 64; ++q) {
- double e = pow(10.0, -q/10.0);
- double le = log(e);
- double le1 = log(1.0 - e);
- for (n = 1; n <= 255; ++n) {
- double *beta = ec->beta + (q<<16|n<<8);
- sum1 = sum = 0.0;
- for (k = n; k >= 0; --k, sum1 = sum) {
- sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
- beta[k] = -10. / M_LN10 * logl(sum1 / sum);
- }
- }
- }
- // initialize ->lhet
- ec->lhet = (double*)calloc(256 * 256, sizeof(double));
- for (n = 0; n < 256; ++n)
- for (k = 0; k < 256; ++k)
- ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
- free(lC);
- return ec;
-}
-
-/**
- * Create errmod_t object with obj.depcorr set to depcorr and initialise
- */
-errmod_t *errmod_init(double depcorr)
-{
- errmod_t *em;
- em = (errmod_t*)calloc(1, sizeof(errmod_t));
- em->depcorr = depcorr;
- em->coef = cal_coef(depcorr, 0.03);
- return em;
-}
-
-/**
- * Deallocate an errmod_t object
- */
-void errmod_destroy(errmod_t *em)
-{
- if (em == 0) return;
- free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta);
- free(em->coef); free(em);
-}
-
-//
-// em: error model to fit to data
-// m: number of alleles across all samples
-// n: number of bases observed in sample
-// bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base]
-// q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j)
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
-{
- // Aux
- // aux.c is total count of each base observed (ignoring strand)
- call_aux_t aux;
- // Loop variables
- int i, j, k;
- // The total count of each base observed per strand
- int w[32];
-
- memset(q, 0, m * m * sizeof(float)); // initialise q to 0
- if (n == 0) return 0;
- // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix
- if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255
- ks_shuffle(uint16_t, n, bases);
- n = 255;
- }
- ks_introsort(uint16_t, n, bases);
- /* zero out w and aux */
- memset(w, 0, 32 * sizeof(int));
- memset(&aux, 0, sizeof(call_aux_t));
-
- for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
- uint16_t b = bases[j];
- /* extract quality and cap at 63 */
- int qual = b>>5 < 4? 4 : b>>5;
- if (qual > 63) qual = 63;
- /* extract base ORed with strand */
- int basestrand = b&0x1f;
- /* extract base */
- int base = b&0xf;
- aux.fsum[base] += em->coef->fk[w[basestrand]];
- aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]];
- ++aux.c[base];
- ++w[basestrand];
- }
-
- // generate likelihood
- for (j = 0; j < m; ++j) {
- float tmp1, tmp3;
- int tmp2;
- // homozygous
- for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) {
- if (k == j) continue;
- tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k];
- }
- if (tmp2) {
- q[j*m+j] = tmp1;
- }
- // heterozygous
- for (k = j + 1; k < m; ++k) {
- int cjk = aux.c[j] + aux.c[k];
- for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) {
- if (i == j || i == k) continue;
- tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i];
- }
- if (tmp2) {
- q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
- } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
- }
- /* clamp to greater than 0 */
- for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
- }
-
- return 0;
-}
+++ /dev/null
-#include "pysam.h"
-
-/* errmod.c -- revised MAQ error model.
-
- Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012, 2013 Genome Research Ltd.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#include <config.h>
-
-#include <math.h>
-#include "errmod.h"
-#include "htslib/ksort.h"
-KSORT_INIT_GENERIC(uint16_t)
-
-/* table of constants generated for given depcorr and eta */
-typedef struct __errmod_coef_t {
- double *fk, *beta, *lhet;
-} errmod_coef_t;
-
-typedef struct {
- double fsum[16], bsum[16];
- uint32_t c[16];
-} call_aux_t;
-
-/* \Gamma(n) = (n-1)! */
-#define lfact(n) lgamma(n+1)
-
-/* generates a success * trials table of bionomial probability densities (log transformed) */
-static double* logbinomial_table( const int n_size )
-{
- /* prob distribution for binom var is p(k) = {n! \over k! (n-k)! } p^k (1-p)^{n-k} */
- /* this calcs p(k) = {log(n!) - log(k!) - log((n-k)!) */
- int k, n;
- double *logbinom = (double*)calloc(n_size * n_size, sizeof(double));
- for (n = 1; n < n_size; ++n) {
- double lfn = lfact(n);
- for (k = 1; k <= n; ++k)
- logbinom[n<<8|k] = lfn - lfact(k) - lfact(n-k);
- }
- return logbinom;
-}
-
-static errmod_coef_t *cal_coef(double depcorr, double eta)
-{
- int k, n, q;
- long double sum, sum1;
- double *lC;
- errmod_coef_t *ec;
-
- ec = calloc(1, sizeof(errmod_coef_t));
- // initialize ->fk
- ec->fk = (double*)calloc(256, sizeof(double));
- ec->fk[0] = 1.0;
- for (n = 1; n < 256; ++n)
- ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
- // initialize ->coef
- ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
-
- lC = logbinomial_table( 256 );
-
- for (q = 1; q < 64; ++q) {
- double e = pow(10.0, -q/10.0);
- double le = log(e);
- double le1 = log(1.0 - e);
- for (n = 1; n <= 255; ++n) {
- double *beta = ec->beta + (q<<16|n<<8);
- sum1 = sum = 0.0;
- for (k = n; k >= 0; --k, sum1 = sum) {
- sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
- beta[k] = -10. / M_LN10 * logl(sum1 / sum);
- }
- }
- }
- // initialize ->lhet
- ec->lhet = (double*)calloc(256 * 256, sizeof(double));
- for (n = 0; n < 256; ++n)
- for (k = 0; k < 256; ++k)
- ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
- free(lC);
- return ec;
-}
-
-/**
- * Create errmod_t object with obj.depcorr set to depcorr and initialise
- */
-errmod_t *errmod_init(double depcorr)
-{
- errmod_t *em;
- em = (errmod_t*)calloc(1, sizeof(errmod_t));
- em->depcorr = depcorr;
- em->coef = cal_coef(depcorr, 0.03);
- return em;
-}
-
-/**
- * Deallocate an errmod_t object
- */
-void errmod_destroy(errmod_t *em)
-{
- if (em == 0) return;
- free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta);
- free(em->coef); free(em);
-}
-
-//
-// em: error model to fit to data
-// m: number of alleles across all samples
-// n: number of bases observed in sample
-// bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base]
-// q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j)
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
-{
- // Aux
- // aux.c is total count of each base observed (ignoring strand)
- call_aux_t aux;
- // Loop variables
- int i, j, k;
- // The total count of each base observed per strand
- int w[32];
-
- memset(q, 0, m * m * sizeof(float)); // initialise q to 0
- if (n == 0) return 0;
- // This section randomly downsamples to 255 depth so as not to go beyond our precalculated matrix
- if (n > 255) { // if we exceed 255 bases observed then shuffle them to sample and only keep the first 255
- ks_shuffle(uint16_t, n, bases);
- n = 255;
- }
- ks_introsort(uint16_t, n, bases);
- /* zero out w and aux */
- memset(w, 0, 32 * sizeof(int));
- memset(&aux, 0, sizeof(call_aux_t));
-
- for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
- uint16_t b = bases[j];
- /* extract quality and cap at 63 */
- int qual = b>>5 < 4? 4 : b>>5;
- if (qual > 63) qual = 63;
- /* extract base ORed with strand */
- int basestrand = b&0x1f;
- /* extract base */
- int base = b&0xf;
- aux.fsum[base] += em->coef->fk[w[basestrand]];
- aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]];
- ++aux.c[base];
- ++w[basestrand];
- }
-
- // generate likelihood
- for (j = 0; j < m; ++j) {
- float tmp1, tmp3;
- int tmp2;
- // homozygous
- for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) {
- if (k == j) continue;
- tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k];
- }
- if (tmp2) {
- q[j*m+j] = tmp1;
- }
- // heterozygous
- for (k = j + 1; k < m; ++k) {
- int cjk = aux.c[j] + aux.c[k];
- for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) {
- if (i == j || i == k) continue;
- tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i];
- }
- if (tmp2) {
- q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
- } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
- }
- /* clamp to greater than 0 */
- for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
- }
-
- return 0;
-}
+++ /dev/null
-/* errmod.h -- revised MAQ error model.
-
- Copyright (C) 2010 Broad Institute.
- Copyright (C) 2012 Genome Research Ltd.
-
- Author: Heng Li <lh3@sanger.ac.uk>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE. */
-
-#ifndef ERRMOD_H
-#define ERRMOD_H
-
-#include <stdint.h>
-
-struct __errmod_coef_t;
-
-typedef struct {
- double depcorr;
- struct __errmod_coef_t *coef;
-} errmod_t;
-
-errmod_t *errmod_init(double depcorr);
-void errmod_destroy(errmod_t *em);
-
-/*
- n: number of bases
- m: maximum base
- bases[i]: qual:6, strand:1, base:4
- q[i*m+j]: phred-scaled likelihood of (i,j)
- */
-int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
-
-#endif
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <config.h>
-#include <ctype.h>
-#include <string.h>
#include <stdlib.h>
#include <stdio.h>
-#include <stdint.h>
#include <unistd.h>
-#include <stdarg.h>
+
#include <htslib/faidx.h>
+#include "samtools.h"
-static void error(const char *format, ...)
+static int usage(FILE *fp, int exit_status)
{
- if ( format )
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- va_end(ap);
- }
- else
- {
- fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
- fprintf(stderr, "\n");
- }
- exit(-1);
+ fprintf(fp, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
+ return exit_status;
}
-
int faidx_main(int argc, char *argv[])
{
int c;
switch(c)
{
case 'h':
+ return usage(stdout, EXIT_SUCCESS);
+
default:
- error(NULL);
+ return usage(stderr, EXIT_FAILURE);
}
}
if ( argc==optind )
- error(NULL);
+ return usage(stdout, EXIT_SUCCESS);
if ( argc==2 )
{
if (fai_build(argv[optind]) != 0) {
- error("Could not build fai index %s.fai\n", argv[optind]);
+ fprintf(stderr, "Could not build fai index %s.fai\n", argv[optind]);
+ return EXIT_FAILURE;
}
return 0;
}
faidx_t *fai = fai_load(argv[optind]);
- if ( !fai ) error("Could not load fai index of %s\n", argv[optind]);
+ if ( !fai ) {
+ fprintf(stderr, "Could not load fai index of %s\n", argv[optind]);
+ return EXIT_FAILURE;
+ }
+
+ int exit_status = EXIT_SUCCESS;
- while ( ++optind<argc )
+ while ( ++optind<argc && exit_status == EXIT_SUCCESS)
{
printf(">%s\n", argv[optind]);
- int i, j, seq_len;
+ int seq_len;
char *seq = fai_fetch(fai, argv[optind], &seq_len);
- if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]);
- for (i=0; i<seq_len; i+=60)
+ if ( seq_len < 0 ) {
+ fprintf(stderr, "Failed to fetch sequence in %s\n", argv[optind]);
+ exit_status = EXIT_FAILURE;
+ break;
+ }
+ size_t i, seq_sz = seq_len;
+ for (i=0; i<seq_sz; i+=60)
{
- for (j=0; j<60 && i+j<seq_len; j++)
- putchar(seq[i+j]);
- putchar('\n');
+ size_t len = i + 60 < seq_sz ? 60 : seq_sz - i;
+ if (fwrite(seq + i, 1, len, stdout) < len ||
+ putchar('\n') == EOF) {
+ print_error_errno("faidx", "failed to write output");
+ exit_status = EXIT_FAILURE;
+ break;
+ }
}
free(seq);
}
fai_destroy(fai);
- return 0;
-}
+ if (fflush(stdout) == EOF) {
+ print_error_errno("faidx", "failed to flush output");
+ exit_status = EXIT_FAILURE;
+ }
+ return exit_status;
+}
/* faidx.c -- faidx subcommand.
- Copyright (C) 2008, 2009, 2013 Genome Research Ltd.
+ Copyright (C) 2008, 2009, 2013, 2016 Genome Research Ltd.
Portions copyright (C) 2011 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <config.h>
-#include <ctype.h>
-#include <string.h>
#include <stdlib.h>
#include <stdio.h>
-#include <stdint.h>
#include <unistd.h>
-#include <stdarg.h>
+
#include <htslib/faidx.h>
+#include "samtools.h"
-static void error(const char *format, ...)
+static int usage(FILE *fp, int exit_status)
{
- if ( format )
- {
- va_list ap;
- va_start(ap, format);
- vfprintf(pysam_stderr, format, ap);
- va_end(ap);
- }
- else
- {
- fprintf(pysam_stderr, "\n");
- fprintf(pysam_stderr, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
- fprintf(pysam_stderr, "\n");
- }
- exit(-1);
+ fprintf(fp, "Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]\n");
+ return exit_status;
}
-
int faidx_main(int argc, char *argv[])
{
int c;
switch(c)
{
case 'h':
+ return usage(pysam_stdout, EXIT_SUCCESS);
+
default:
- error(NULL);
+ return usage(pysam_stderr, EXIT_FAILURE);
}
}
if ( argc==optind )
- error(NULL);
+ return usage(pysam_stdout, EXIT_SUCCESS);
if ( argc==2 )
{
if (fai_build(argv[optind]) != 0) {
- error("Could not build fai index %s.fai\n", argv[optind]);
+ fprintf(pysam_stderr, "Could not build fai index %s.fai\n", argv[optind]);
+ return EXIT_FAILURE;
}
return 0;
}
faidx_t *fai = fai_load(argv[optind]);
- if ( !fai ) error("Could not load fai index of %s\n", argv[optind]);
+ if ( !fai ) {
+ fprintf(pysam_stderr, "Could not load fai index of %s\n", argv[optind]);
+ return EXIT_FAILURE;
+ }
+
+ int exit_status = EXIT_SUCCESS;
- while ( ++optind<argc )
+ while ( ++optind<argc && exit_status == EXIT_SUCCESS)
{
fprintf(pysam_stdout, ">%s\n", argv[optind]);
- int i, j, seq_len;
+ int seq_len;
char *seq = fai_fetch(fai, argv[optind], &seq_len);
- if ( seq_len < 0 ) error("Failed to fetch sequence in %s\n", argv[optind]);
- for (i=0; i<seq_len; i+=60)
+ if ( seq_len < 0 ) {
+ fprintf(pysam_stderr, "Failed to fetch sequence in %s\n", argv[optind]);
+ exit_status = EXIT_FAILURE;
+ break;
+ }
+ size_t i, seq_sz = seq_len;
+ for (i=0; i<seq_sz; i+=60)
{
- for (j=0; j<60 && i+j<seq_len; j++)
- fputc(seq[i+j], pysam_stdout);
- fputc('\n', pysam_stdout);
+ size_t len = i + 60 < seq_sz ? 60 : seq_sz - i;
+ if (fwrite(seq + i, 1, len, pysam_stdout) < len ||
+ fputc('\n', pysam_stdout) == EOF) {
+ print_error_errno("faidx", "failed to write output");
+ exit_status = EXIT_FAILURE;
+ break;
+ }
}
free(seq);
}
fai_destroy(fai);
- return 0;
-}
+ if (fflush(pysam_stdout) == EOF) {
+ print_error_errno("faidx", "failed to flush output");
+ exit_status = EXIT_FAILURE;
+ }
+ return exit_status;
+}
+++ /dev/null
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <config.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kprobaln.h"
-
-/*****************************************
- * Probabilistic banded glocal alignment *
- *****************************************/
-
-#define EI .25
-#define EM .33333333333
-
-static float g_qual2prob[256];
-
-#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
-
-kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
-kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
-
-/*
- The topology of the profile HMM:
-
- /\ /\ /\ /\
- I[1] I[k-1] I[k] I[L]
- ^ \ \ ^ \ ^ \ \ ^
- | \ \ | \ | \ \ |
- M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
- \ \/ \/ \/ /
- \ /\ /\ /\ /
- -> D[k-1] -> D[k] ->
-
- M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
-
- On input, _ref is the reference sequence and _query is the query
- sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
- ambiguous residue. iqual is the base quality. c sets the gap open
- probability, gap extension probability and band width.
-
- On output, state and q are arrays of length l_query. The higher 30
- bits give the reference position the query base is matched to and the
- lower two bits can be 0 (an alignment match) or 1 (an
- insertion). q[i] gives the phred scaled posterior probability of
- state[i] being wrong.
- */
-int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q)
-{
- double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
- float *qual, *_qual;
- const uint8_t *ref, *query;
- int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
-
- if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault
-
- /*** initialization ***/
- is_backward = state && q? 1 : 0;
- ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
- bw = l_ref > l_query? l_ref : l_query;
- if (bw > c->bw) bw = c->bw;
- if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
- bw2 = bw * 2 + 1;
- // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
- f = calloc(l_query+1, sizeof(double*));
- if (is_backward) b = calloc(l_query+1, sizeof(double*));
- for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0
- f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
- if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
- }
- s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
- // initialize qual
- _qual = calloc(l_query, sizeof(float));
- if (g_qual2prob[0] == 0)
- for (i = 0; i < 256; ++i)
- g_qual2prob[i] = pow(10, -i/10.);
- for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
- qual = _qual - 1;
- // initialize transition probability
- sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
- m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
- m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
- m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
- bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
- /*** forward ***/
- // f[0]
- set_u(k, bw, 0, 0);
- f[0][k] = s[0] = 1.;
- { // f[1]
- double *fi = f[1], sum;
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- fi[u+0] = e * bM; fi[u+1] = EI * bI;
- sum += fi[u] + fi[u+1];
- }
- // rescale
- s[1] = sum;
- set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
- for (k = _beg; k <= _end; ++k) fi[k] /= sum;
- }
- // f[2..l_query]
- for (i = 2; i <= l_query; ++i) {
- double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
- int beg = 1, end = l_ref, x, _beg, _end;
- uint8_t qyi = query[i];
- x = i - bw; beg = beg > x? beg : x; // band start
- x = i + bw; end = end < x? end : x; // band end
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u, v11, v01, v10;
- double e;
- e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
- set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
- fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
- fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
- fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
- sum += fi[u] + fi[u+1] + fi[u+2];
-// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
- }
- // rescale
- s[i] = sum;
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
- }
- { // f[l_query+1]
- double sum;
- for (k = 1, sum = 0.; k <= l_ref; ++k) {
- int u;
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
- }
- s[l_query+1] = sum; // the last scaling factor
- }
- { // compute likelihood
- double p = 1., Pr1 = 0.;
- for (i = 0; i <= l_query + 1; ++i) {
- p *= s[i];
- if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
- }
- Pr1 += -4.343 * log(p * l_ref * l_query);
- Pr = (int)(Pr1 + .499);
- if (!is_backward) { // skip backward and MAP
- for (i = 0; i <= l_query; ++i) free(f[i]);
- free(f); free(s); free(_qual);
- return Pr;
- }
- }
- /*** backward ***/
- // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
- for (k = 1; k <= l_ref; ++k) {
- int u;
- double *bi = b[l_query];
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
- }
- // b[l_query-1..1]
- for (i = l_query - 1; i >= 1; --i) {
- int beg = 1, end = l_ref, x, _beg, _end;
- double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
- uint8_t qyi1 = query[i+1];
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = end; k >= beg; --k) {
- int u, v11, v01, v10;
- double e;
- set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
- e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
- bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
- bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
- bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
-// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
- }
- // rescale
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
- }
- { // b[0]
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
- double sum = 0.;
- for (k = end; k >= beg; --k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
- }
- set_u(k, bw, 0, 0);
- pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
- }
- is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
- /*** MAP ***/
- for (i = 1; i <= l_query; ++i) {
- double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
- int beg = 1, end = l_ref, x, max_k = -1;
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = beg; k <= end; ++k) {
- int u;
- double z;
- set_u(u, bw, i, k);
- z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
- z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
- }
- max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
- if (state) state[i-1] = max_k;
- if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
-#ifdef _MAIN
- fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
- "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
-#endif
- }
- /*** free ***/
- for (i = 0; i <= l_query; ++i) {
- free(f[i]); free(b[i]);
- }
- free(f); free(b); free(s); free(_qual);
- return Pr;
-}
-
-#ifdef _MAIN
-#include <unistd.h>
-int main(int argc, char *argv[])
-{
- uint8_t conv[256], *iqual, *ref, *query;
- int c, l_ref, l_query, i, q = 30, b = 10, P;
- while ((c = getopt(argc, argv, "b:q:")) >= 0) {
- switch (c) {
- case 'b': b = atoi(optarg); break;
- case 'q': q = atoi(optarg); break;
- }
- }
- if (optind + 2 > argc) {
- fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
- return 1;
- }
- memset(conv, 4, 256);
- conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
- conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
- ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
- l_ref = strlen((char*)ref); l_query = strlen((char*)query);
- for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
- for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
- iqual = malloc(l_query);
- memset(iqual, q, l_query);
- kpa_par_def.bw = b;
- P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
- fprintf(stderr, "%d\n", P);
- free(iqual);
- return 0;
-}
-#endif
+++ /dev/null
-#include "pysam.h"
-
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#include <config.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <math.h>
-#include "kprobaln.h"
-
-/*****************************************
- * Probabilistic banded glocal alignment *
- *****************************************/
-
-#define EI .25
-#define EM .33333333333
-
-static float g_qual2prob[256];
-
-#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
-
-kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
-kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
-
-/*
- The topology of the profile HMM:
-
- /\ /\ /\ /\
- I[1] I[k-1] I[k] I[L]
- ^ \ \ ^ \ ^ \ \ ^
- | \ \ | \ | \ \ |
- M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
- \ \/ \/ \/ /
- \ /\ /\ /\ /
- -> D[k-1] -> D[k] ->
-
- M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
-
- On input, _ref is the reference sequence and _query is the query
- sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
- ambiguous residue. iqual is the base quality. c sets the gap open
- probability, gap extension probability and band width.
-
- On output, state and q are arrays of length l_query. The higher 30
- bits give the reference position the query base is matched to and the
- lower two bits can be 0 (an alignment match) or 1 (an
- insertion). q[i] gives the phred scaled posterior probability of
- state[i] being wrong.
- */
-int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q)
-{
- double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
- float *qual, *_qual;
- const uint8_t *ref, *query;
- int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
-
- if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault
-
- /*** initialization ***/
- is_backward = state && q? 1 : 0;
- ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
- bw = l_ref > l_query? l_ref : l_query;
- if (bw > c->bw) bw = c->bw;
- if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
- bw2 = bw * 2 + 1;
- // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
- f = calloc(l_query+1, sizeof(double*));
- if (is_backward) b = calloc(l_query+1, sizeof(double*));
- for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0
- f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
- if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
- }
- s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
- // initialize qual
- _qual = calloc(l_query, sizeof(float));
- if (g_qual2prob[0] == 0)
- for (i = 0; i < 256; ++i)
- g_qual2prob[i] = pow(10, -i/10.);
- for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
- qual = _qual - 1;
- // initialize transition probability
- sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
- m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
- m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
- m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
- bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
- /*** forward ***/
- // f[0]
- set_u(k, bw, 0, 0);
- f[0][k] = s[0] = 1.;
- { // f[1]
- double *fi = f[1], sum;
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- fi[u+0] = e * bM; fi[u+1] = EI * bI;
- sum += fi[u] + fi[u+1];
- }
- // rescale
- s[1] = sum;
- set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
- for (k = _beg; k <= _end; ++k) fi[k] /= sum;
- }
- // f[2..l_query]
- for (i = 2; i <= l_query; ++i) {
- double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
- int beg = 1, end = l_ref, x, _beg, _end;
- uint8_t qyi = query[i];
- x = i - bw; beg = beg > x? beg : x; // band start
- x = i + bw; end = end < x? end : x; // band end
- for (k = beg, sum = 0.; k <= end; ++k) {
- int u, v11, v01, v10;
- double e;
- e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
- set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
- fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
- fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
- fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
- sum += fi[u] + fi[u+1] + fi[u+2];
-// fprintf(pysam_stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
- }
- // rescale
- s[i] = sum;
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
- }
- { // f[l_query+1]
- double sum;
- for (k = 1, sum = 0.; k <= l_ref; ++k) {
- int u;
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
- }
- s[l_query+1] = sum; // the last scaling factor
- }
- { // compute likelihood
- double p = 1., Pr1 = 0.;
- for (i = 0; i <= l_query + 1; ++i) {
- p *= s[i];
- if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
- }
- Pr1 += -4.343 * log(p * l_ref * l_query);
- Pr = (int)(Pr1 + .499);
- if (!is_backward) { // skip backward and MAP
- for (i = 0; i <= l_query; ++i) free(f[i]);
- free(f); free(s); free(_qual);
- return Pr;
- }
- }
- /*** backward ***/
- // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
- for (k = 1; k <= l_ref; ++k) {
- int u;
- double *bi = b[l_query];
- set_u(u, bw, l_query, k);
- if (u < 3 || u >= bw2*3+3) continue;
- bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
- }
- // b[l_query-1..1]
- for (i = l_query - 1; i >= 1; --i) {
- int beg = 1, end = l_ref, x, _beg, _end;
- double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
- uint8_t qyi1 = query[i+1];
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = end; k >= beg; --k) {
- int u, v11, v01, v10;
- double e;
- set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
- e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
- bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
- bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
- bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
-// fprintf(pysam_stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
- }
- // rescale
- set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
- for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
- }
- { // b[0]
- int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
- double sum = 0.;
- for (k = end; k >= beg; --k) {
- int u;
- double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
- set_u(u, bw, 1, k);
- if (u < 3 || u >= bw2*3+3) continue;
- sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
- }
- set_u(k, bw, 0, 0);
- pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
- }
- is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
- /*** MAP ***/
- for (i = 1; i <= l_query; ++i) {
- double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
- int beg = 1, end = l_ref, x, max_k = -1;
- x = i - bw; beg = beg > x? beg : x;
- x = i + bw; end = end < x? end : x;
- for (k = beg; k <= end; ++k) {
- int u;
- double z;
- set_u(u, bw, i, k);
- z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
- z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
- }
- max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
- if (state) state[i-1] = max_k;
- if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
-#ifdef _MAIN
- fprintf(pysam_stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
- "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
-#endif
- }
- /*** free ***/
- for (i = 0; i <= l_query; ++i) {
- free(f[i]); free(b[i]);
- }
- free(f); free(b); free(s); free(_qual);
- return Pr;
-}
-
-#ifdef _MAIN
-#include <unistd.h>
-int samtools_kprobaln_main(int argc, char *argv[])
-{
- uint8_t conv[256], *iqual, *ref, *query;
- int c, l_ref, l_query, i, q = 30, b = 10, P;
- while ((c = getopt(argc, argv, "b:q:")) >= 0) {
- switch (c) {
- case 'b': b = atoi(optarg); break;
- case 'q': q = atoi(optarg); break;
- }
- }
- if (optind + 2 > argc) {
- fprintf(pysam_stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
- return 1;
- }
- memset(conv, 4, 256);
- conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
- conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
- ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
- l_ref = strlen((char*)ref); l_query = strlen((char*)query);
- for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
- for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
- iqual = malloc(l_query);
- memset(iqual, q, l_query);
- kpa_par_def.bw = b;
- P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
- fprintf(pysam_stderr, "%d\n", P);
- free(iqual);
- return 0;
-}
-#endif
+++ /dev/null
-/* The MIT License
-
- Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3@live.co.uk>
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
-
- The above copyright notice and this permission notice shall be
- included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-*/
-
-#ifndef LH3_KPROBALN_H_
-#define LH3_KPROBALN_H_
-
-#include <stdint.h>
-
-typedef struct {
- float d, e;
- int bw;
-} kpa_par_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
- const kpa_par_t *c, int *state, uint8_t *q);
-
-#ifdef __cplusplus
-}
-#endif
-
-extern kpa_par_t kpa_par_def, kpa_par_alt;
-
-#endif
}
if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
- if (write_cns) puts(t[4].s); t[4].l = 0;
+ if (write_cns) {
+ if (t[4].l) puts(t[4].s);
+ t[4].l = 0;
+ }
} else if (strcmp(s.s, "AF") == 0) { // padded read position
int reversed, neg, pos;
if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
}
if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
ks_getuntil(ks, '\n', &s, &dret); // skip the empty line
- if (write_cns) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout); t[4].l = 0;
+ if (write_cns) {
+ if (t[4].l) fputs(t[4].s, pysam_stdout) & fputc('\n', pysam_stdout);
+ t[4].l = 0;
+ }
} else if (strcmp(s.s, "AF") == 0) { // padded read position
int reversed, neg, pos;
if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'");
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'),
{ NULL, 0, NULL, 0 }
};
fprintf(stderr, " Padded reference sequence file [null]\n");
fprintf(stderr, " -o FILE Output file name [stdout]\n");
fprintf(stderr, " -? Longer help\n");
- sam_global_opt_help(stderr, "-...-");
+ sam_global_opt_help(stderr, "-...--");
if (is_long_help)
fprintf(stderr,
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T'),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 'T', '-'),
{ NULL, 0, NULL, 0 }
};
fprintf(pysam_stderr, " Padded reference sequence file [null]\n");
fprintf(pysam_stderr, " -o FILE Output file name [pysam_stdout]\n");
fprintf(pysam_stderr, " -? Longer help\n");
- sam_global_opt_help(pysam_stderr, "-...-");
+ sam_global_opt_help(pysam_stderr, "-...--");
if (is_long_help)
fprintf(pysam_stderr,
#include <stdint.h>
#include <math.h>
#include <zlib.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "errmod.h"
#include "sam_opts.h"
#include "samtools.h"
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n");
fprintf(stderr, "\n");
- sam_global_opt_help(stderr, "-....");
+ sam_global_opt_help(stderr, "-....-");
return 1;
}
#include <stdint.h>
#include <math.h>
#include <zlib.h>
+#include "htslib/hts.h"
#include "htslib/sam.h"
#include "htslib/kstring.h"
-#include "errmod.h"
#include "sam_opts.h"
#include "samtools.h"
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0, '-'),
{ NULL, 0, NULL, 0 }
};
// fprintf(pysam_stderr, " -e do not discover SNPs (effective with -l)\n");
fprintf(pysam_stderr, "\n");
- sam_global_opt_help(pysam_stderr, "-....");
+ sam_global_opt_help(pysam_stderr, "-....-");
return 1;
}
samFile *file;
struct { BGZF *bam; } x; // Hack so that fp->x.bam still works
bam_hdr_t *header;
- short is_write:1;
+ unsigned short is_write:1;
} samfile_t;
#ifdef __cplusplus
r |= hts_opt_add((hts_opt **)&ga->out.specific, ref);
free(ref);
break;
+ } else if (strcmp(lopt->name, "threads") == 0) {
+ ga->nthreads = atoi(optarg);
+ break;
// } else if (strcmp(lopt->name, "verbose") == 0) {
// ga->verbosity++;
// break;
int i = 0;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0),
+ SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0,0),
{ NULL, 0, NULL, 0 }
};
else if (strcmp(lopts[i].name, "reference") == 0)
fprintf(fp,"reference FILE\n"
" Reference sequence FASTA FILE [null]\n");
+ else if (strcmp(lopts[i].name, "threads") == 0)
+ fprintf(fp,"threads INT\n"
+ " Number of additional threads to use [0]\n");
// else if (strcmp(lopts[i].name, "verbose") == 0)
// fprintf(fp,"verbose\n"
// " Increment level of verbosity\n");
r |= hts_opt_add((hts_opt **)&ga->out.specific, ref);
free(ref);
break;
+ } else if (strcmp(lopt->name, "threads") == 0) {
+ ga->nthreads = atoi(optarg);
+ break;
// } else if (strcmp(lopt->name, "verbose") == 0) {
// ga->verbosity++;
// break;
int i = 0;
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0),
+ SAM_OPT_GLOBAL_OPTIONS(0,0,0,0,0,0),
{ NULL, 0, NULL, 0 }
};
else if (strcmp(lopts[i].name, "reference") == 0)
fprintf(fp,"reference FILE\n"
" Reference sequence FASTA FILE [null]\n");
+ else if (strcmp(lopts[i].name, "threads") == 0)
+ fprintf(fp,"threads INT\n"
+ " Number of additional threads to use [0]\n");
// else if (strcmp(lopts[i].name, "verbose") == 0)
// fprintf(fp,"verbose\n"
// " Increment level of verbosity\n");
htsFormat in;
htsFormat out;
char *reference;
+ int nthreads;
//int verbosity;
} sam_global_args;
SAM_OPT_OUTPUT_FMT,
SAM_OPT_OUTPUT_FMT_OPTION,
SAM_OPT_REFERENCE,
+ SAM_OPT_NTHREADS,
//SAM_OPT_VERBOSE
};
// 0 No short option has been assigned. Use --long-opt only.
// '-' Both long and short options are disabled.
// <c> Otherwise the equivalent short option is character <c>.
-#define SAM_OPT_GLOBAL_OPTIONS(o1, o2, o3, o4, o5) \
+#define SAM_OPT_GLOBAL_OPTIONS(o1, o2, o3, o4, o5, o6) \
{"input-fmt", required_argument, NULL, SAM_OPT_VAL(o1, SAM_OPT_INPUT_FMT)}, \
{"input-fmt-option", required_argument, NULL, SAM_OPT_VAL(o2, SAM_OPT_INPUT_FMT_OPTION)}, \
{"output-fmt", required_argument, NULL, SAM_OPT_VAL(o3, SAM_OPT_OUTPUT_FMT)}, \
{"output-fmt-option", required_argument, NULL, SAM_OPT_VAL(o4, SAM_OPT_OUTPUT_FMT_OPTION)}, \
- {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}
+ {"reference", required_argument, NULL, SAM_OPT_VAL(o5, SAM_OPT_REFERENCE)}, \
+ {"threads", required_argument, NULL, SAM_OPT_VAL(o6, SAM_OPT_NTHREADS)}
//{"verbose", no_argument, NULL, SAM_OPT_VERBOSE}
/*
--- /dev/null
+/* sam_utils.c -- various utilities internal to samtools.
+
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: John Marshall <jm18@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+
+#include "samtools.h"
+
+static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+{
+ fflush(stdout);
+ if (subcommand && *subcommand) fprintf(stderr, "samtools %s: ", subcommand);
+ else fprintf(stderr, "samtools: ");
+ vfprintf(stderr, format, args);
+ if (extra) fprintf(stderr, ": %s\n", extra);
+ else fprintf(stderr, "\n");
+ fflush(stderr);
+}
+
+void print_error(const char *subcommand, const char *format, ...)
+{
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, NULL);
+ va_end(args);
+}
+
+void print_error_errno(const char *subcommand, const char *format, ...)
+{
+ int err = errno;
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, err? strerror(err) : NULL);
+ va_end(args);
+}
--- /dev/null
+#include "pysam.h"
+
+/* sam_utils.c -- various utilities internal to samtools.
+
+ Copyright (C) 2014-2016 Genome Research Ltd.
+
+ Author: John Marshall <jm18@sanger.ac.uk>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+
+#include "samtools.h"
+
+static void vprint_error_core(const char *subcommand, const char *format, va_list args, const char *extra)
+{
+ fflush(pysam_stdout);
+ if (subcommand && *subcommand) fprintf(pysam_stderr, "samtools %s: ", subcommand);
+ else fprintf(pysam_stderr, "samtools: ");
+ vfprintf(pysam_stderr, format, args);
+ if (extra) fprintf(pysam_stderr, ": %s\n", extra);
+ else fprintf(pysam_stderr, "\n");
+ fflush(pysam_stderr);
+}
+
+void print_error(const char *subcommand, const char *format, ...)
+{
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, NULL);
+ va_end(args);
+}
+
+void print_error_errno(const char *subcommand, const char *format, ...)
+{
+ int err = errno;
+ va_list args;
+ va_start(args, format);
+ vprint_error_core(subcommand, format, args, err? strerror(err) : NULL);
+ va_end(args);
+}
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2015 Genome Research Ltd.
+ Copyright (C) 2009-2017 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <stdbool.h>
#include <assert.h>
#include <getopt.h>
+#include <ctype.h>
#include "htslib/sam.h"
#include "htslib/faidx.h"
#include "htslib/kstring.h"
#include "htslib/khash.h"
+#include "htslib/thread_pool.h"
#include "samtools.h"
#include "sam_opts.h"
+
+#define DEFAULT_BARCODE_TAG "BC"
+#define DEFAULT_QUALITY_TAG "QT"
+
KHASH_SET_INIT_STR(rg)
typedef khash_t(rg) *rghash_t;
int min_mapQ;
int flag_on;
int flag_off;
+ int flag_alloff;
int min_qlen;
int remove_B;
uint32_t subsam_seed;
}
if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off))
return 1;
+ if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff))
+ return 1;
if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b))))
return 1;
if (settings->subsam_frac > 0.) {
int main_samview(int argc, char *argv[])
{
int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0;
- int is_long_help = 0, n_threads = 0;
+ int is_long_help = 0;
int64_t count = 0;
samFile *in = 0, *out = 0, *un_out=0;
+ FILE *fp_out = NULL;
bam_hdr_t *header = NULL;
char out_mode[5], out_un_mode[5], *out_format = "";
char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ htsThreadPool p = {NULL, 0};
samview_settings_t settings = {
.rghash = NULL,
.min_mapQ = 0,
.flag_on = 0,
.flag_off = 0,
+ .flag_alloff = 0,
.min_qlen = 0,
.remove_B = 0,
.subsam_seed = 0,
};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'),
- { "threads", required_argument, NULL, '@' },
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
{ NULL, 0, NULL, 0 }
};
strcpy(out_mode, "w");
strcpy(out_un_mode, "w");
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
+ // Convert likely user input 0,1,2,... to pseudo-random
+ // values with more entropy and more bits set
srand(settings.subsam_seed);
settings.subsam_seed = rand();
}
case 'U': fn_un_out = strdup(optarg); break;
case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
*/
case '?': is_long_help = 1; break;
case 'B': settings.remove_B = 1; break;
- case '@': n_threads = strtol(optarg, 0, 0); break;
case 'x':
{
if (strlen(optarg) != 2) {
}
}
}
+ else {
+ if (fn_out) {
+ fp_out = fopen(fn_out, "w");
+ if (fp_out == NULL) {
+ print_error_errno("view", "can't create \"%s\"", fn_out);
+ ret = EXIT_FAILURE;
+ goto view_end;
+ }
+ }
+ }
- if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
+ if (ga.nthreads > 1) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(stderr, "Error creating thread pool\n");
+ ret = 1;
+ goto view_end;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
if (is_header_only) goto view_end; // no need to print alignments
if (optind + 1 >= argc) { // convert/print the entire file
}
view_end:
- if (is_count && ret == 0)
- printf("%" PRId64 "\n", count);
+ if (is_count && ret == 0) {
+ if (fprintf(fn_out? fp_out : stdout, "%" PRId64 "\n", count) < 0) {
+ if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out);
+ else print_error_errno("view", "writing to standard output failed");
+ ret = EXIT_FAILURE;
+ }
+ }
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
+ if (fp_out) fclose(fp_out);
free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
sam_global_args_free(&ga);
if (settings.remove_aux_len) {
free(settings.remove_aux);
}
+
+ if (p.pool)
+ hts_tpool_destroy(p.pool);
+
return ret;
}
" -l STR only include reads in library STR [null]\n"
" -m INT only include reads with number of CIGAR operations consuming\n"
" query sequence >= INT [0]\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
+" fraction of templates/read pairs to keep; INT part sets seed)\n"
// read processing
" -x STR read tag to strip (repeatable) [null]\n"
" -B collapse the backward CIGAR operation\n"
-" -s FLOAT integer part sets seed of random number generator [0];\n"
-" rest sets fraction of templates to subsample [no subsampling]\n"
// general options
-" -@, --threads INT\n"
-" number of BAM/CRAM compression threads [0]\n"
" -? print long help, including note about region specification\n"
" -S ignored (input format is auto-detected)\n");
- sam_global_opt_help(fp, "-.O.T");
+ sam_global_opt_help(fp, "-.O.T@");
fprintf(fp, "\n");
if (is_long_help)
"Usage: samtools %s [options...] <in.bam>\n", command);
fprintf(to,
"Options:\n"
-" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
-" -1 FILE write paired reads flagged READ1 to FILE\n"
-" -2 FILE write paired reads flagged READ2 to FILE\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
-" -n don't append /1 and /2 to the read name\n");
+" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
+" -1 FILE write paired reads flagged READ1 to FILE\n"
+" -2 FILE write paired reads flagged READ2 to FILE\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -n don't append /1 and /2 to the read name\n"
+" -N always append /1 and /2 to the read name\n");
if (fq) fprintf(to,
-" -O output quality in the OQ tag if present\n");
+" -O output quality in the OQ tag if present\n");
fprintf(to,
-" -s FILE write singleton reads to FILE [assume single-end]\n"
-" -t copy RG, BC and QT tags to the %s header line\n",
+" -s FILE write singleton reads to FILE [assume single-end]\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -v INT default quality score if not given in file [1]\n");
- sam_global_opt_help(to, "-.--.");
+" -v INT default quality score if not given in file [1]\n"
+" --i1 FILE write first index reads to FILE\n"
+" --i2 FILE write second index reads to FILE\n"
+" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
+" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
+" --index-format STR How to parse barcode and quality tags\n\n");
+ sam_global_opt_help(to, "-.--.@");
+ fprintf(to,
+" \n"
+" The index-format string describes how to parse the barcode and quality tags, for example:\n"
+" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n"
+" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n"
+" If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
+" 'read until the separator or end of tag', for example:\n"
+" n*i* ignore the left part of the tag until the separator, then use the second part\n"
+" of the tag as index 1\n");
}
typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
char *fnse;
char *fnr[3];
char *fn_input; // pointer to input filename in argv do not free
- bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ bool has12, has12always, use_oq, copy_tags;
+ int flag_on, flag_off, flag_alloff;
sam_global_args ga;
fastfile filetype;
int def_qual;
+ char *barcode_tag;
+ char *quality_tag;
+ char *index_file[2];
+ char *index_format;
} bam2fq_opts_t;
typedef struct bam2fq_state {
samFile *fp;
FILE *fpse;
FILE *fpr[3];
+ FILE *fpi[2];
bam_hdr_t *h;
bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ int flag_on, flag_off, flag_alloff;
fastfile filetype;
int def_qual;
} bam2fq_state_t;
+/*
+ * Get and decode the read from a BAM record.
+ *
+ * TODO: htslib really needs an interface for this. Consider this or perhaps
+ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
+ * functions as string formatted equivalents to bam_get_{seq,qual}?
+ */
+
+/*
+ * Reverse a string in place.
+ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
+ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
+ */
+static char *reverse(char *str)
+{
+ int i = strlen(str)-1,j=0;
+ char ch;
+ while (i>j) {
+ ch = str[i];
+ str[i]= str[j];
+ str[j] = ch;
+ i--;
+ j++;
+ }
+ return str;
+}
+
+/* return the read, reverse complemented if necessary */
+static char *get_read(const bam1_t *rec)
+{
+ int len = rec->core.l_qseq + 1;
+ char *read = calloc(1, len);
+ char *seq = (char *)bam_get_seq(rec);
+ int n;
+
+ if (!read) return NULL;
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
+ else read[n] = seq_nt16_str[bam_seqi(seq,n)];
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(read);
+ return read;
+}
+
+/*
+ * get and decode the quality from a BAM record
+ */
+static char *get_quality(const bam1_t *rec)
+{
+ char *quality = calloc(1, rec->core.l_qseq + 1);
+ char *q = (char *)bam_get_qual(rec);
+ int n;
+
+ if (*q == '\xff') { free(quality); return NULL; }
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ quality[n] = q[n]+33;
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(quality);
+ return quality;
+}
+
+//
+// End of htslib complaints
+//
+
+
static readpart which_readpart(const bam1_t *b)
{
if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
}
}
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+/*
+ * parse the length part from the index-format string
+ */
+static int getLength(char **s)
{
- int i;
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- uint8_t *seq;
- uint8_t *qual = bam_get_qual(b);
- const uint8_t *oq = NULL;
- if (state->use_oq) {
- oq = bam_aux_get(b, "OQ");
- if (oq) oq++; // skip tag type
+ int n = 0;
+ while (**s) {
+ if (**s == '*') { n=-1; (*s)++; break; }
+ if ( !isdigit(**s)) break;
+ n = n*10 + ((**s)-'0');
+ (*s)++;
}
- bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
+ return n;
+}
+
+static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int i;
linebuf->l = 0;
// Write read name
- readpart readpart = which_readpart(b);
kputc(state->filetype == FASTA? '>' : '@', linebuf);
- kputs(bam_get_qname(b), linebuf);
+ kputs(bam_get_qname(rec), linebuf);
// Add the /1 /2 if requested
if (state->has12) {
+ readpart readpart = which_readpart(rec);
if (readpart == READ_1) kputs("/1", linebuf);
else if (readpart == READ_2) kputs("/2", linebuf);
}
if (state->copy_tags) {
for (i = 0; copied_tags[i]; ++i) {
uint8_t *s;
- if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
- kputc('\t', linebuf);
- kputsn(copied_tags[i], 2, linebuf);
- kputsn(":Z:", 3, linebuf);
- kputs(bam_aux2Z(s), linebuf);
+ if ((s = bam_aux_get(rec, copied_tags[i])) != 0) {
+ if (*s == 'Z') {
+ kputc('\t', linebuf);
+ kputsn(copied_tags[i], 2, linebuf);
+ kputsn(":Z:", 3, linebuf);
+ kputs(bam_aux2Z(s), linebuf);
+ }
}
}
}
kputc('\n', linebuf);
-
- seq = bam_get_seq(b);
-
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
- kputc(c, linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- char c = seq_nt16_str[bam_seqi(seq,i)];
- kputc(c, linebuf);
- }
- }
+ kputs(seq, linebuf);
kputc('\n', linebuf);
if (state->filetype == FASTQ) {
// Write quality
kputs("+\n", linebuf);
- if (has_qual) {
- if (state->use_oq && oq) {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(oq[i], linebuf);
- }
- } else {
- kputs((char*)oq, linebuf);
- }
- } else {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(33 + qual[i], linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- kputc(33 + qual[i], linebuf);
- }
- }
- }
+ if (qual && *qual) {
+ kputs(qual, linebuf);
} else {
- for (i = 0; i < qlen; ++i) {
+ int len = strlen(seq);
+ for (i = 0; i < len; ++i) {
kputc(33 + state->def_qual, linebuf);
}
}
return true;
}
+/*
+ * Create FASTQ lines from the barcode tag using the index-format
+ */
+static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts)
+{
+ uint8_t *p;
+ char *ifmt = opts->index_format;
+ char *tag = NULL;
+ char *qual = NULL;
+ int file_number = 0;
+ kstring_t linebuf = { 0, 0, NULL }; // Buffer
+
+ // read barcode tag
+ p = bam_aux_get(rec,opts->barcode_tag);
+ if (p) tag = bam_aux2Z(p);
+
+ if (!tag) return true; // there is no tag
+
+ // read quality tag
+ p = bam_aux_get(rec, opts->quality_tag);
+ if (p) qual = bam_aux2Z(p);
+
+ // Parse the index-format string
+ while (*ifmt) {
+ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly
+ char action = *ifmt; // should be 'i' or 'n'
+ ifmt++; // skip over action
+ int index_len = getLength(&ifmt);
+
+ char *sub_tag = calloc(1, strlen(tag)+1);
+ char *sub_qual = calloc(1, strlen(tag)+1);
+ int n = 0;
+
+ if (index_len < 0) {
+ // read until separator
+ while (isalpha(*tag)) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ if (*tag) { // skip separator
+ tag++;
+ if (qual) qual++;
+ }
+ } else {
+ // read index_len characters
+ while (index_len-- && *tag) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ }
+
+ if (action=='i' && *sub_tag && state->fpi[file_number]) {
+ make_fq_line(rec, sub_tag, sub_qual, &linebuf, state);
+ fputs(linebuf.s, state->fpi[file_number++]);
+ }
+ free(sub_qual); free(sub_tag);
+
+ }
+
+ free(linebuf.s);
+ return true;
+}
+
+// Transform a bam1_t record into a string with the FASTQ representation of it
+// @returns false for error, true for success
+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int32_t qlen = b->core.l_qseq;
+ assert(qlen >= 0);
+ const uint8_t *oq = NULL;
+ char *qual = NULL;
+
+ char *seq = get_read(b);
+
+ if (state->use_oq) {
+ oq = bam_aux_get(b, "OQ");
+ if (oq) {
+ oq++;
+ qual = strdup(bam_aux2Z(oq));
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ reverse(qual);
+ }
+ }
+ } else {
+ qual = get_quality(b);
+ }
+
+ make_fq_line(b, seq, qual, linebuf, state);
+
+ free(qual);
+ free(seq);
+ return true;
+}
+
+static void free_opts(bam2fq_opts_t *opts)
+{
+ free(opts->barcode_tag);
+ free(opts->quality_tag);
+ free(opts->index_format);
+ free(opts);
+}
+
// return true if valid
static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
{
// Parse args
bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
opts->has12 = true;
+ opts->has12always = false;
opts->filetype = FASTQ;
opts->def_qual = 1;
+ opts->barcode_tag = NULL;
+ opts->quality_tag = NULL;
+ opts->index_format = NULL;
+ opts->index_file[0] = NULL;
+ opts->index_file[1] = NULL;
int c;
sam_global_args_init(&opts->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+ {"i1", required_argument, NULL, 1},
+ {"I1", required_argument, NULL, 1},
+ {"i2", required_argument, NULL, 2},
+ {"I2", required_argument, NULL, 2},
+ {"if", required_argument, NULL, 3},
+ {"IF", required_argument, NULL, 3},
+ {"index-format", required_argument, NULL, 3},
+ {"barcode-tag", required_argument, NULL, 'b'},
+ {"quality-tag", required_argument, NULL, 'q'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) {
+ while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) {
switch (c) {
+ case 'b': opts->barcode_tag = strdup(optarg); break;
+ case 'q': opts->quality_tag = strdup(optarg); break;
+ case 1 : opts->index_file[0] = optarg; break;
+ case 2 : opts->index_file[1] = optarg; break;
+ case 3 : opts->index_format = strdup(optarg); break;
case '0': opts->fnr[0] = optarg; break;
case '1': opts->fnr[1] = optarg; break;
case '2': opts->fnr[2] = optarg; break;
case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
case 'F': opts->flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
+ case 'N': opts->has12always = true; break;
case 'O': opts->use_oq = true; break;
case 's': opts->fnse = optarg; break;
case 't': opts->copy_tags = true; break;
case 'v': opts->def_qual = atoi(optarg); break;
- case '?': bam2fq_usage(stderr, argv[0]); free(opts); return false;
+ case '?': bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
default:
if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
- bam2fq_usage(stderr, argv[0]); free(opts); return false;
+ bam2fq_usage(stderr, argv[0]); free_opts(opts); return false;
}
break;
}
}
if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
+ if (opts->has12always) opts->has12 = true;
+
+ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
+ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+
+ int nIndex = 0;
+ if (opts->index_format) {
+ char *s;
+ for (s = opts->index_format; *s; s++) {
+ if (*s == 'i') nIndex++;
+ }
+ }
+ if (nIndex>2) {
+ fprintf(stderr,"Invalid index format: more than 2 indexes\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (opts->index_file[1] && !opts->index_file[0]) {
+ fprintf(stderr, "Index one specified, but index two not given\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==2 && !opts->index_file[1]) {
+ fprintf(stderr, "index_format specifies two indexes, but only one index file given\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==1 && !opts->index_file[0]) {
+ fprintf(stderr, "index_format specifies an index, but no index file given\n");
+ bam2fq_usage(stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
if (opts->def_qual < 0 || 93 < opts->def_qual) {
fprintf(stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
bam2fq_usage(stderr, argv[0]);
- free(opts);
- return true;
+ free_opts(opts);
+ return false;
}
const char* type_str = argv[0];
} else {
print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
bam2fq_usage(stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) == 0) {
+ fprintf(stderr, "No input file specified.\n");
bam2fq_usage(stdout, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) != 1) {
fprintf(stderr, "Too many arguments.\n");
bam2fq_usage(stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
opts->fn_input = argv[optind];
bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
+ state->flag_alloff = opts->flag_alloff;
state->has12 = opts->has12;
state->use_oq = opts->use_oq;
state->copy_tags = opts->copy_tags;
free(state);
return false;
}
+ if (opts->ga.nthreads > 0)
+ hts_set_threads(state->fp, opts->ga.nthreads);
uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
if (opts->use_oq) rf |= SAM_AUX;
if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
state->fpr[i] = stdout;
}
}
+ for (i = 0; i < 2; i++) {
+ state->fpi[i] = NULL;
+ if (opts->index_file[i]) {
+ state->fpi[i] = fopen(opts->index_file[i], "w");
+ if (state->fpi[i] == NULL) {
+ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]);
+ free(state);
+ return false;
+ }
+ }
+ }
state->h = sam_hdr_read(state->fp);
if (state->h == NULL) {
for (i = 0; i < 3; ++i) {
if (state->fpr[i] != stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
}
+ for (i = 0; i < 2; i++) {
+ if (state->fpi[i] && fclose(state->fpi[i])) {
+ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
+ valid = false;
+ }
+ }
free(state);
return valid;
}
{
return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
|| (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0);
+ || (b->core.flag&(state->flag_off)) != 0
+ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
}
-static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
+static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
bam1_t* b = bam_init1();
char *current_qname = NULL;
return false;
}
score[which_readpart(b)] = b_score;
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
}
if (!valid)
return valid;
}
-static bool bam2fq_mainloop(bam2fq_state_t *state)
+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
// process a name collated BAM into fastq
bam1_t* b = bam_init1();
int64_t n_reads = 0; // Statistics
kstring_t linebuf = { 0, 0, NULL }; // Buffer
while (sam_read1(state->fp, state->h, b) >= 0) {
- if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0) continue;
+ if (filter_it_out(b, state)) continue;
++n_reads;
if (!bam1_to_fq(b, &linebuf, state)) return false;
fputs(linebuf.s, state->fpr[which_readpart(b)]);
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
free(linebuf.s);
bam_destroy1(b);
if (!init_state(opts, &state)) return EXIT_FAILURE;
if (state->fpse) {
- if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE;
} else {
- if (!bam2fq_mainloop(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
}
if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
sam_global_args_free(&opts->ga);
- free(opts);
+ free_opts(opts);
return status;
}
/* sam_view.c -- SAM<->BAM<->CRAM conversion.
- Copyright (C) 2009-2015 Genome Research Ltd.
+ Copyright (C) 2009-2017 Genome Research Ltd.
Portions copyright (C) 2009, 2011, 2012 Broad Institute.
Author: Heng Li <lh3@sanger.ac.uk>
#include <stdlib.h>
#include <string.h>
+#include <strings.h>
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <stdbool.h>
#include <assert.h>
#include <getopt.h>
+#include <ctype.h>
#include "htslib/sam.h"
#include "htslib/faidx.h"
#include "htslib/kstring.h"
#include "htslib/khash.h"
+#include "htslib/thread_pool.h"
#include "samtools.h"
#include "sam_opts.h"
+
+#define DEFAULT_BARCODE_TAG "BC"
+#define DEFAULT_QUALITY_TAG "QT"
+
KHASH_SET_INIT_STR(rg)
typedef khash_t(rg) *rghash_t;
int min_mapQ;
int flag_on;
int flag_off;
+ int flag_alloff;
int min_qlen;
int remove_B;
uint32_t subsam_seed;
}
if (b->core.qual < settings->min_mapQ || ((b->core.flag & settings->flag_on) != settings->flag_on) || (b->core.flag & settings->flag_off))
return 1;
+ if (settings->flag_alloff && ((b->core.flag & settings->flag_alloff) == settings->flag_alloff))
+ return 1;
if (settings->bed && (b->core.tid < 0 || !bed_overlap(settings->bed, h->target_name[b->core.tid], b->core.pos, bam_endpos(b))))
return 1;
if (settings->subsam_frac > 0.) {
int main_samview(int argc, char *argv[])
{
int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0;
- int is_long_help = 0, n_threads = 0;
+ int is_long_help = 0;
int64_t count = 0;
samFile *in = 0, *out = 0, *un_out=0;
+ FILE *fp_out = NULL;
bam_hdr_t *header = NULL;
char out_mode[5], out_un_mode[5], *out_format = "";
char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0;
sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
+ htsThreadPool p = {NULL, 0};
samview_settings_t settings = {
.rghash = NULL,
.min_mapQ = 0,
.flag_on = 0,
.flag_off = 0,
+ .flag_alloff = 0,
.min_qlen = 0,
.remove_B = 0,
.subsam_seed = 0,
};
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'),
- { "threads", required_argument, NULL, '@' },
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T', '@'),
{ NULL, 0, NULL, 0 }
};
strcpy(out_mode, "w");
strcpy(out_un_mode, "w");
while ((c = getopt_long(argc, argv,
- "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:",
+ "SbBcCt:h1Ho:O:q:f:F:G:ul:r:?T:R:L:s:@:m:x:U:",
lopts, NULL)) >= 0) {
switch (c) {
case 's':
if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) {
+ // Convert likely user input 0,1,2,... to pseudo-random
+ // values with more entropy and more bits set
srand(settings.subsam_seed);
settings.subsam_seed = rand();
}
case 'U': fn_un_out = strdup(optarg); break;
case 'f': settings.flag_on |= strtol(optarg, 0, 0); break;
case 'F': settings.flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': settings.flag_alloff |= strtol(optarg, 0, 0); break;
case 'q': settings.min_mapQ = atoi(optarg); break;
case 'u': compress_level = 0; break;
case '1': compress_level = 1; break;
*/
case '?': is_long_help = 1; break;
case 'B': settings.remove_B = 1; break;
- case '@': n_threads = strtol(optarg, 0, 0); break;
case 'x':
{
if (strlen(optarg) != 2) {
}
}
}
+ else {
+ if (fn_out) {
+ fp_out = fopen(fn_out, "w");
+ if (fp_out == NULL) {
+ print_error_errno("view", "can't create \"%s\"", fn_out);
+ ret = EXIT_FAILURE;
+ goto view_end;
+ }
+ }
+ }
- if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); }
+ if (ga.nthreads > 1) {
+ if (!(p.pool = hts_tpool_init(ga.nthreads))) {
+ fprintf(pysam_stderr, "Error creating thread pool\n");
+ ret = 1;
+ goto view_end;
+ }
+ hts_set_opt(in, HTS_OPT_THREAD_POOL, &p);
+ if (out) hts_set_opt(out, HTS_OPT_THREAD_POOL, &p);
+ }
if (is_header_only) goto view_end; // no need to print alignments
if (optind + 1 >= argc) { // convert/print the entire file
}
view_end:
- if (is_count && ret == 0)
- fprintf(pysam_stdout, "%" PRId64 "\n", count);
-
+ if (is_count && ret == 0) {
+ if (fprintf(fn_out? fp_out : pysam_stdout, "%" PRId64 "\n", count) < 0) {
+ if (fn_out) print_error_errno("view", "writing to \"%s\" failed", fn_out);
+ else print_error_errno("view", "writing to standard output failed");
+ ret = EXIT_FAILURE;
+ }
+ }
+
// close files, free and return
if (in) check_sam_close("view", in, fn_in, "standard input", &ret);
if (out) check_sam_close("view", out, fn_out, "standard output", &ret);
if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret);
+ if (fp_out) fclose(fp_out);
free(fn_list); free(fn_out); free(settings.library); free(fn_un_out);
sam_global_args_free(&ga);
if (settings.remove_aux_len) {
free(settings.remove_aux);
}
+
+ if (p.pool)
+ hts_tpool_destroy(p.pool);
+
return ret;
}
" -l STR only include reads in library STR [null]\n"
" -m INT only include reads with number of CIGAR operations consuming\n"
" query sequence >= INT [0]\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -s FLOAT subsample reads (given INT.FRAC option value, 0.FRAC is the\n"
+" fraction of templates/read pairs to keep; INT part sets seed)\n"
// read processing
" -x STR read tag to strip (repeatable) [null]\n"
" -B collapse the backward CIGAR operation\n"
-" -s FLOAT integer part sets seed of random number generator [0];\n"
-" rest sets fraction of templates to subsample [no subsampling]\n"
// general options
-" -@, --threads INT\n"
-" number of BAM/CRAM compression threads [0]\n"
" -? print long help, including note about region specification\n"
" -S ignored (input format is auto-detected)\n");
- sam_global_opt_help(fp, "-.O.T");
+ sam_global_opt_help(fp, "-.O.T@");
fprintf(fp, "\n");
if (is_long_help)
"Usage: samtools %s [options...] <in.bam>\n", command);
fprintf(to,
"Options:\n"
-" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
-" -1 FILE write paired reads flagged READ1 to FILE\n"
-" -2 FILE write paired reads flagged READ2 to FILE\n"
-" -f INT only include reads with all bits set in INT set in FLAG [0]\n"
-" -F INT only include reads with none of the bits set in INT set in FLAG [0]\n"
-" -n don't append /1 and /2 to the read name\n");
+" -0 FILE write paired reads flagged both or neither READ1 and READ2 to FILE\n"
+" -1 FILE write paired reads flagged READ1 to FILE\n"
+" -2 FILE write paired reads flagged READ2 to FILE\n"
+" -f INT only include reads with all of the FLAGs in INT present [0]\n" // F&x == x
+" -F INT only include reads with none of the FLAGS in INT present [0]\n" // F&x == 0
+" -G INT only EXCLUDE reads with all of the FLAGs in INT present [0]\n" // !(F&x == x)
+" -n don't append /1 and /2 to the read name\n"
+" -N always append /1 and /2 to the read name\n");
if (fq) fprintf(to,
-" -O output quality in the OQ tag if present\n");
+" -O output quality in the OQ tag if present\n");
fprintf(to,
-" -s FILE write singleton reads to FILE [assume single-end]\n"
-" -t copy RG, BC and QT tags to the %s header line\n",
+" -s FILE write singleton reads to FILE [assume single-end]\n"
+" -t copy RG, BC and QT tags to the %s header line\n",
fq ? "FASTQ" : "FASTA");
if (fq) fprintf(to,
-" -v INT default quality score if not given in file [1]\n");
- sam_global_opt_help(to, "-.--.");
+" -v INT default quality score if not given in file [1]\n"
+" --i1 FILE write first index reads to FILE\n"
+" --i2 FILE write second index reads to FILE\n"
+" --barcode-tag TAG Barcode tag [default: " DEFAULT_BARCODE_TAG "]\n"
+" --quality-tag TAG Quality tag [default: " DEFAULT_QUALITY_TAG "]\n"
+" --index-format STR How to parse barcode and quality tags\n\n");
+ sam_global_opt_help(to, "-.--.@");
+ fprintf(to,
+" \n"
+" The index-format string describes how to parse the barcode and quality tags, for example:\n"
+" i14i8 the first 14 characters are index 1, the next 8 characters are index 2\n"
+" n8i14 ignore the first 8 characters, and use the next 14 characters for index 1\n"
+" If the tag contains a separator, then the numeric part can be replaced with '*' to mean\n"
+" 'read until the separator or end of tag', for example:\n"
+" n*i* ignore the left part of the tag until the separator, then use the second part\n"
+" of the tag as index 1\n");
}
typedef enum { READ_UNKNOWN = 0, READ_1 = 1, READ_2 = 2 } readpart;
char *fnse;
char *fnr[3];
char *fn_input; // pointer to input filename in argv do not free
- bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ bool has12, has12always, use_oq, copy_tags;
+ int flag_on, flag_off, flag_alloff;
sam_global_args ga;
fastfile filetype;
int def_qual;
+ char *barcode_tag;
+ char *quality_tag;
+ char *index_file[2];
+ char *index_format;
} bam2fq_opts_t;
typedef struct bam2fq_state {
samFile *fp;
FILE *fpse;
FILE *fpr[3];
+ FILE *fpi[2];
bam_hdr_t *h;
bool has12, use_oq, copy_tags;
- int flag_on, flag_off;
+ int flag_on, flag_off, flag_alloff;
fastfile filetype;
int def_qual;
} bam2fq_state_t;
+/*
+ * Get and decode the read from a BAM record.
+ *
+ * TODO: htslib really needs an interface for this. Consider this or perhaps
+ * bam_get_seq_str (current vs original orientation) and bam_get_qual_str
+ * functions as string formatted equivalents to bam_get_{seq,qual}?
+ */
+
+/*
+ * Reverse a string in place.
+ * From http://stackoverflow.com/questions/8534274/is-the-strrev-function-not-available-in-linux.
+ * Author Sumit-naik: http://stackoverflow.com/users/4590926/sumit-naik
+ */
+static char *reverse(char *str)
+{
+ int i = strlen(str)-1,j=0;
+ char ch;
+ while (i>j) {
+ ch = str[i];
+ str[i]= str[j];
+ str[j] = ch;
+ i--;
+ j++;
+ }
+ return str;
+}
+
+/* return the read, reverse complemented if necessary */
+static char *get_read(const bam1_t *rec)
+{
+ int len = rec->core.l_qseq + 1;
+ char *read = calloc(1, len);
+ char *seq = (char *)bam_get_seq(rec);
+ int n;
+
+ if (!read) return NULL;
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ if (rec->core.flag & BAM_FREVERSE) read[n] = seq_nt16_str[seq_comp_table[bam_seqi(seq,n)]];
+ else read[n] = seq_nt16_str[bam_seqi(seq,n)];
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(read);
+ return read;
+}
+
+/*
+ * get and decode the quality from a BAM record
+ */
+static char *get_quality(const bam1_t *rec)
+{
+ char *quality = calloc(1, rec->core.l_qseq + 1);
+ char *q = (char *)bam_get_qual(rec);
+ int n;
+
+ if (*q == '\xff') { free(quality); return NULL; }
+
+ for (n=0; n < rec->core.l_qseq; n++) {
+ quality[n] = q[n]+33;
+ }
+ if (rec->core.flag & BAM_FREVERSE) reverse(quality);
+ return quality;
+}
+
+//
+// End of htslib complaints
+//
+
+
static readpart which_readpart(const bam1_t *b)
{
if ((b->core.flag & BAM_FREAD1) && !(b->core.flag & BAM_FREAD2)) {
}
}
-// Transform a bam1_t record into a string with the FASTQ representation of it
-// @returns false for error, true for success
-static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+/*
+ * parse the length part from the index-format string
+ */
+static int getLength(char **s)
{
- int i;
- int32_t qlen = b->core.l_qseq;
- assert(qlen >= 0);
- uint8_t *seq;
- uint8_t *qual = bam_get_qual(b);
- const uint8_t *oq = NULL;
- if (state->use_oq) {
- oq = bam_aux_get(b, "OQ");
- if (oq) oq++; // skip tag type
+ int n = 0;
+ while (**s) {
+ if (**s == '*') { n=-1; (*s)++; break; }
+ if ( !isdigit(**s)) break;
+ n = n*10 + ((**s)-'0');
+ (*s)++;
}
- bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality
+ return n;
+}
+
+static bool make_fq_line(const bam1_t *rec, char *seq, char *qual, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int i;
linebuf->l = 0;
// Write read name
- readpart readpart = which_readpart(b);
kputc(state->filetype == FASTA? '>' : '@', linebuf);
- kputs(bam_get_qname(b), linebuf);
+ kputs(bam_get_qname(rec), linebuf);
// Add the /1 /2 if requested
if (state->has12) {
+ readpart readpart = which_readpart(rec);
if (readpart == READ_1) kputs("/1", linebuf);
else if (readpart == READ_2) kputs("/2", linebuf);
}
if (state->copy_tags) {
for (i = 0; copied_tags[i]; ++i) {
uint8_t *s;
- if ((s = bam_aux_get(b, copied_tags[i])) != 0) {
- kputc('\t', linebuf);
- kputsn(copied_tags[i], 2, linebuf);
- kputsn(":Z:", 3, linebuf);
- kputs(bam_aux2Z(s), linebuf);
+ if ((s = bam_aux_get(rec, copied_tags[i])) != 0) {
+ if (*s == 'Z') {
+ kputc('\t', linebuf);
+ kputsn(copied_tags[i], 2, linebuf);
+ kputsn(":Z:", 3, linebuf);
+ kputs(bam_aux2Z(s), linebuf);
+ }
}
}
}
kputc('\n', linebuf);
-
- seq = bam_get_seq(b);
-
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]];
- kputc(c, linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- char c = seq_nt16_str[bam_seqi(seq,i)];
- kputc(c, linebuf);
- }
- }
+ kputs(seq, linebuf);
kputc('\n', linebuf);
if (state->filetype == FASTQ) {
// Write quality
kputs("+\n", linebuf);
- if (has_qual) {
- if (state->use_oq && oq) {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(oq[i], linebuf);
- }
- } else {
- kputs((char*)oq, linebuf);
- }
- } else {
- if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
- for (i = qlen-1; i > -1; --i) {
- kputc(33 + qual[i], linebuf);
- }
- } else {
- for (i = 0; i < qlen; ++i) {
- kputc(33 + qual[i], linebuf);
- }
- }
- }
+ if (qual && *qual) {
+ kputs(qual, linebuf);
} else {
- for (i = 0; i < qlen; ++i) {
+ int len = strlen(seq);
+ for (i = 0; i < len; ++i) {
kputc(33 + state->def_qual, linebuf);
}
}
return true;
}
+/*
+ * Create FASTQ lines from the barcode tag using the index-format
+ */
+static bool tags2fq(bam1_t *rec, const bam2fq_state_t *state, const bam2fq_opts_t* opts)
+{
+ uint8_t *p;
+ char *ifmt = opts->index_format;
+ char *tag = NULL;
+ char *qual = NULL;
+ int file_number = 0;
+ kstring_t linebuf = { 0, 0, NULL }; // Buffer
+
+ // read barcode tag
+ p = bam_aux_get(rec,opts->barcode_tag);
+ if (p) tag = bam_aux2Z(p);
+
+ if (!tag) return true; // there is no tag
+
+ // read quality tag
+ p = bam_aux_get(rec, opts->quality_tag);
+ if (p) qual = bam_aux2Z(p);
+
+ // Parse the index-format string
+ while (*ifmt) {
+ if (file_number > 1) break; // shouldn't happen if we've validated paramaters correctly
+ char action = *ifmt; // should be 'i' or 'n'
+ ifmt++; // skip over action
+ int index_len = getLength(&ifmt);
+
+ char *sub_tag = calloc(1, strlen(tag)+1);
+ char *sub_qual = calloc(1, strlen(tag)+1);
+ int n = 0;
+
+ if (index_len < 0) {
+ // read until separator
+ while (isalpha(*tag)) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ if (*tag) { // skip separator
+ tag++;
+ if (qual) qual++;
+ }
+ } else {
+ // read index_len characters
+ while (index_len-- && *tag) {
+ sub_tag[n] = *tag++;
+ if (qual) sub_qual[n] = *qual++;
+ n++;
+ }
+ }
+
+ if (action=='i' && *sub_tag && state->fpi[file_number]) {
+ make_fq_line(rec, sub_tag, sub_qual, &linebuf, state);
+ fputs(linebuf.s, state->fpi[file_number++]);
+ }
+ free(sub_qual); free(sub_tag);
+
+ }
+
+ free(linebuf.s);
+ return true;
+}
+
+// Transform a bam1_t record into a string with the FASTQ representation of it
+// @returns false for error, true for success
+static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state)
+{
+ int32_t qlen = b->core.l_qseq;
+ assert(qlen >= 0);
+ const uint8_t *oq = NULL;
+ char *qual = NULL;
+
+ char *seq = get_read(b);
+
+ if (state->use_oq) {
+ oq = bam_aux_get(b, "OQ");
+ if (oq) {
+ oq++;
+ qual = strdup(bam_aux2Z(oq));
+ if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented
+ reverse(qual);
+ }
+ }
+ } else {
+ qual = get_quality(b);
+ }
+
+ make_fq_line(b, seq, qual, linebuf, state);
+
+ free(qual);
+ free(seq);
+ return true;
+}
+
+static void free_opts(bam2fq_opts_t *opts)
+{
+ free(opts->barcode_tag);
+ free(opts->quality_tag);
+ free(opts->index_format);
+ free(opts);
+}
+
// return true if valid
static bool parse_opts(int argc, char *argv[], bam2fq_opts_t** opts_out)
{
// Parse args
bam2fq_opts_t* opts = calloc(1, sizeof(bam2fq_opts_t));
opts->has12 = true;
+ opts->has12always = false;
opts->filetype = FASTQ;
opts->def_qual = 1;
+ opts->barcode_tag = NULL;
+ opts->quality_tag = NULL;
+ opts->index_format = NULL;
+ opts->index_file[0] = NULL;
+ opts->index_file[1] = NULL;
int c;
sam_global_args_init(&opts->ga);
static const struct option lopts[] = {
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
+ {"i1", required_argument, NULL, 1},
+ {"I1", required_argument, NULL, 1},
+ {"i2", required_argument, NULL, 2},
+ {"I2", required_argument, NULL, 2},
+ {"if", required_argument, NULL, 3},
+ {"IF", required_argument, NULL, 3},
+ {"index-format", required_argument, NULL, 3},
+ {"barcode-tag", required_argument, NULL, 'b'},
+ {"quality-tag", required_argument, NULL, 'q'},
{ NULL, 0, NULL, 0 }
};
- while ((c = getopt_long(argc, argv, "0:1:2:f:F:nOs:tv:", lopts, NULL)) > 0) {
+ while ((c = getopt_long(argc, argv, "0:1:2:f:F:G:nNOs:tv:@:", lopts, NULL)) > 0) {
switch (c) {
+ case 'b': opts->barcode_tag = strdup(optarg); break;
+ case 'q': opts->quality_tag = strdup(optarg); break;
+ case 1 : opts->index_file[0] = optarg; break;
+ case 2 : opts->index_file[1] = optarg; break;
+ case 3 : opts->index_format = strdup(optarg); break;
case '0': opts->fnr[0] = optarg; break;
case '1': opts->fnr[1] = optarg; break;
case '2': opts->fnr[2] = optarg; break;
case 'f': opts->flag_on |= strtol(optarg, 0, 0); break;
case 'F': opts->flag_off |= strtol(optarg, 0, 0); break;
+ case 'G': opts->flag_alloff |= strtol(optarg, 0, 0); break;
case 'n': opts->has12 = false; break;
+ case 'N': opts->has12always = true; break;
case 'O': opts->use_oq = true; break;
case 's': opts->fnse = optarg; break;
case 't': opts->copy_tags = true; break;
case 'v': opts->def_qual = atoi(optarg); break;
- case '?': bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false;
+ case '?': bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false;
default:
if (parse_sam_global_opt(c, optarg, lopts, &opts->ga) != 0) {
- bam2fq_usage(pysam_stderr, argv[0]); free(opts); return false;
+ bam2fq_usage(pysam_stderr, argv[0]); free_opts(opts); return false;
}
break;
}
}
if (opts->fnr[1] || opts->fnr[2]) opts->has12 = false;
+ if (opts->has12always) opts->has12 = true;
+
+ if (!opts->barcode_tag) opts->barcode_tag = strdup(DEFAULT_BARCODE_TAG);
+ if (!opts->quality_tag) opts->quality_tag = strdup(DEFAULT_QUALITY_TAG);
+
+ int nIndex = 0;
+ if (opts->index_format) {
+ char *s;
+ for (s = opts->index_format; *s; s++) {
+ if (*s == 'i') nIndex++;
+ }
+ }
+ if (nIndex>2) {
+ fprintf(pysam_stderr,"Invalid index format: more than 2 indexes\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (opts->index_file[1] && !opts->index_file[0]) {
+ fprintf(pysam_stderr, "Index one specified, but index two not given\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==2 && !opts->index_file[1]) {
+ fprintf(pysam_stderr, "index_format specifies two indexes, but only one index file given\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
+
+ if (nIndex==1 && !opts->index_file[0]) {
+ fprintf(pysam_stderr, "index_format specifies an index, but no index file given\n");
+ bam2fq_usage(pysam_stderr, argv[0]);
+ free_opts(opts);
+ return false;
+ }
if (opts->def_qual < 0 || 93 < opts->def_qual) {
fprintf(pysam_stderr, "Invalid -v default quality %i, allowed range 0 to 93\n", opts->def_qual);
bam2fq_usage(pysam_stderr, argv[0]);
- free(opts);
- return true;
+ free_opts(opts);
+ return false;
}
const char* type_str = argv[0];
} else {
print_error("bam2fq", "Unrecognised type call \"%s\", this should be impossible... but you managed it!", type_str);
bam2fq_usage(pysam_stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) == 0) {
+ fprintf(pysam_stderr, "No input file specified.\n");
bam2fq_usage(pysam_stdout, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
if ((argc - (optind)) != 1) {
fprintf(pysam_stderr, "Too many arguments.\n");
bam2fq_usage(pysam_stderr, argv[0]);
- free(opts);
+ free_opts(opts);
return false;
}
opts->fn_input = argv[optind];
bam2fq_state_t* state = calloc(1, sizeof(bam2fq_state_t));
state->flag_on = opts->flag_on;
state->flag_off = opts->flag_off;
+ state->flag_alloff = opts->flag_alloff;
state->has12 = opts->has12;
state->use_oq = opts->use_oq;
state->copy_tags = opts->copy_tags;
free(state);
return false;
}
+ if (opts->ga.nthreads > 0)
+ hts_set_threads(state->fp, opts->ga.nthreads);
uint32_t rf = SAM_QNAME | SAM_FLAG | SAM_SEQ | SAM_QUAL;
if (opts->use_oq) rf |= SAM_AUX;
if (hts_set_opt(state->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
state->fpr[i] = pysam_stdout;
}
}
+ for (i = 0; i < 2; i++) {
+ state->fpi[i] = NULL;
+ if (opts->index_file[i]) {
+ state->fpi[i] = fopen(opts->index_file[i], "w");
+ if (state->fpi[i] == NULL) {
+ print_error_errno("bam2fq", "Cannot write to i%d file \"%s\"", i+1, opts->index_file[i]);
+ free(state);
+ return false;
+ }
+ }
+ }
state->h = sam_hdr_read(state->fp);
if (state->h == NULL) {
for (i = 0; i < 3; ++i) {
if (state->fpr[i] != pysam_stdout && fclose(state->fpr[i])) { print_error_errno("bam2fq", "Error closing r%d file \"%s\"", i, opts->fnr[i]); valid = false; }
}
+ for (i = 0; i < 2; i++) {
+ if (state->fpi[i] && fclose(state->fpi[i])) {
+ print_error_errno("bam2fq", "Error closing i%d file \"%s\"", i+1, opts->index_file[i]);
+ valid = false;
+ }
+ }
free(state);
return valid;
}
{
return (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
|| (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0);
+ || (b->core.flag&(state->flag_off)) != 0
+ || (b->core.flag&(state->flag_alloff) && (b->core.flag&(state->flag_alloff)) == state->flag_alloff));
}
-static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state)
+static bool bam2fq_mainloop_singletontrack(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
bam1_t* b = bam_init1();
char *current_qname = NULL;
return false;
}
score[which_readpart(b)] = b_score;
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
}
if (!valid)
return valid;
}
-static bool bam2fq_mainloop(bam2fq_state_t *state)
+static bool bam2fq_mainloop(bam2fq_state_t *state, bam2fq_opts_t* opts)
{
// process a name collated BAM into fastq
bam1_t* b = bam_init1();
int64_t n_reads = 0; // Statistics
kstring_t linebuf = { 0, 0, NULL }; // Buffer
while (sam_read1(state->fp, state->h, b) >= 0) {
- if (b->core.flag&(BAM_FSECONDARY|BAM_FSUPPLEMENTARY) // skip secondary and supplementary alignments
- || (b->core.flag&(state->flag_on)) != state->flag_on // or reads indicated by filter flags
- || (b->core.flag&(state->flag_off)) != 0) continue;
+ if (filter_it_out(b, state)) continue;
++n_reads;
if (!bam1_to_fq(b, &linebuf, state)) return false;
fputs(linebuf.s, state->fpr[which_readpart(b)]);
+ if (state->fpi[0]) tags2fq(b, state, opts);
}
free(linebuf.s);
bam_destroy1(b);
if (!init_state(opts, &state)) return EXIT_FAILURE;
if (state->fpse) {
- if (!bam2fq_mainloop_singletontrack(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop_singletontrack(state,opts)) status = EXIT_FAILURE;
} else {
- if (!bam2fq_mainloop(state)) status = EXIT_FAILURE;
+ if (!bam2fq_mainloop(state,opts)) status = EXIT_FAILURE;
}
if (!destroy_state(opts, state, &status)) return EXIT_FAILURE;
sam_global_args_free(&opts->ga);
- free(opts);
+ free_opts(opts);
return status;
}
// reads. Mates mapped to different chromosomes have isize==0.
int32_t isize = bam_line->core.isize;
if ( isize<0 ) isize = -isize;
- if ( stats->info->nisize > 0 && isize >= stats->info->nisize )
- isize = stats->info->nisize-1;
+ if ( stats->info->nisize > 0 && isize > stats->info->nisize )
+ isize = stats->info->nisize;
if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
{
int pos_fst = bam_line->core.mpos - bam_line->core.pos;
stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
}
- if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
+ if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
if ( prev_tid==-1 || prev_tid!=tid )
{
prev_tid = tid;
printf(" -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
printf(" -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
printf(" -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
- sam_global_opt_help(stdout, "-.--.");
+ sam_global_opt_help(stdout, "-.--.@");
printf("\n");
}
else
// .. bam
samFile* sam;
if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) {
- error("Failed to open: %s\n", bam_fname);
+ print_error_errno("stats", "failed to open \"%s\"", bam_fname);
return 1;
}
info->sam = sam;
info->sam_header = sam_hdr_read(sam);
if (info->sam_header == NULL) {
- error("Failed to read header for '%s'\n", bam_fname);
+ print_error("stats", "failed to read header for \"%s\"", bam_fname);
return 1;
}
return 0;
stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize = init_isize_t(info->nisize);
+ stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0);
stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t));
static const struct option loptions[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{"help", no_argument, NULL, 'h'},
{"remove-dups", no_argument, NULL, 'd'},
{"sam", no_argument, NULL, 's'},
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 )
+ while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:@:",loptions,NULL))>0 )
{
switch (opt)
{
}
if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1;
+ if (ga.nthreads > 0)
+ hts_set_threads(info->sam, ga.nthreads);
stats_t *all_stats = stats_init();
stats_t *curr_stats = NULL;
stats_t;
KHASH_MAP_INIT_STR(c2stats, stats_t*)
-static void error(const char *format, ...);
+static int error(const char *format, ...);
int is_in_regions(bam1_t *bam_line, stats_t *stats);
void realloc_buffers(stats_t *stats, int seq_len);
// reads. Mates mapped to different chromosomes have isize==0.
int32_t isize = bam_line->core.isize;
if ( isize<0 ) isize = -isize;
- if ( stats->info->nisize > 0 && isize >= stats->info->nisize )
- isize = stats->info->nisize-1;
+ if ( stats->info->nisize > 0 && isize > stats->info->nisize )
+ isize = stats->info->nisize;
if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
{
int pos_fst = bam_line->core.mpos - bam_line->core.pos;
stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos);
}
- if ( (sscanf(&line.s[i+1],"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
+ if ( (sscanf(&line.s[i+1],"%u %u",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n", &line.s[i+1]);
if ( prev_tid==-1 || prev_tid!=tid )
{
prev_tid = tid;
}
-static void error(const char *format, ...)
+static int error(const char *format, ...)
{
if ( !format )
{
fprintf(pysam_stdout, " -S, --split <tag> Also write statistics to separate files split by tagged field.\n");
fprintf(pysam_stdout, " -t, --target-regions <file> Do stats in these regions only. Tab-delimited file chr,from,to, 1-based, inclusive.\n");
fprintf(pysam_stdout, " -x, --sparse Suppress outputting IS rows where there are no insertions.\n");
- sam_global_opt_help(pysam_stdout, "-.--.");
+ sam_global_opt_help(pysam_stdout, "-.--.@");
fprintf(pysam_stdout, "\n");
+ return(0);
}
else
{
// .. bam
samFile* sam;
if ((sam = sam_open_format(bam_fname, "r", in_fmt)) == 0) {
- error("Failed to open: %s\n", bam_fname);
+ print_error_errno("stats", "failed to open \"%s\"", bam_fname);
return 1;
}
info->sam = sam;
info->sam_header = sam_hdr_read(sam);
if (info->sam_header == NULL) {
- error("Failed to read header for '%s'\n", bam_fname);
+ print_error("stats", "failed to read header for \"%s\"", bam_fname);
return 1;
}
return 0;
stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t));
stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t));
stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t));
- stats->isize = init_isize_t(info->nisize);
+ stats->isize = init_isize_t(info->nisize ?info->nisize+1 :0);
stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t));
stats->mpc_buf = info->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
stats->acgtno_cycles = calloc(stats->nbases,sizeof(acgtno_count_t));
static const struct option loptions[] =
{
- SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
+ SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0, '@'),
{"help", no_argument, NULL, 'h'},
{"remove-dups", no_argument, NULL, 'd'},
{"sam", no_argument, NULL, 's'},
};
int opt;
- while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:",loptions,NULL))>0 )
+ while ( (opt=getopt_long(argc,argv,"?hdsxr:c:l:i:t:m:q:f:F:I:1:S:P:@:",loptions,NULL))>0 )
{
switch (opt)
{
case 'S': info->split_tag = optarg; break;
case 'P': info->split_prefix = optarg; break;
case '?':
- case 'h': error(NULL);
+ case 'h': return(error(NULL));
default:
if (parse_sam_global_opt(opt, optarg, loptions, &ga) != 0)
error("Unknown argument: %s\n", optarg);
if ( !bam_fname )
{
if ( isatty(STDIN_FILENO) )
- error(NULL);
+ return(error(NULL));
bam_fname = "-";
}
if (init_stat_info_fname(info, bam_fname, &ga.in)) return 1;
+ if (ga.nthreads > 0)
+ hts_set_threads(info->sam, ga.nthreads);
stats_t *all_stats = stats_init();
stats_t *curr_stats = NULL;
bool check_test_1(const bam_hdr_t* hdr) {
const char *test1_res =
"@HD\tVN:1.4\n"
- "@SQ\tSN:blah\n";
+ "@SQ\tSN:blah\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test1_res)) {
return false;
const char *test2_res =
"@HD\tVN:1.4\n"
"@SQ\tSN:blah\n"
- "@RG\tID:fish\n";
+ "@RG\tID:fish\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test2_res)) {
return false;
return true;
}
-int main(int argc, char**argv)
+int main(int argc, char *argv[])
{
// test state
const int NUM_TESTS = 2;
int failure = 0;
int getopt_char;
+ char *test_argv[] = { "test_filter_header_rg", "foo\tbar", "baz" };
+ char *arg_list = stringify_argv(3, test_argv);
while ((getopt_char = getopt(argc, argv, "v")) != -1) {
switch (getopt_char) {
case 'v':
// test
xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe
- bool result_1 = filter_header_rg(hdr1, id_to_keep_1);
+ bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list);
fclose(stderr);
if (verbose) printf("END RUN test 1\n");
// test
xfreopen(tempfname, "w", stderr); // Redirect stderr to pipe
- bool result_2 = filter_header_rg(hdr2, id_to_keep_2);
+ bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list);
fclose(stderr);
if (verbose) printf("END RUN test 2\n");
// Cleanup
free(res.s);
+ free(arg_list);
remove(tempfname);
if (failure > 0)
fprintf(orig_stderr, "%d failures %d successes\n", failure, success);
bool check_test_1(const bam_hdr_t* hdr) {
const char *test1_res =
"@HD\tVN:1.4\n"
- "@SQ\tSN:blah\n";
+ "@SQ\tSN:blah\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test1_res)) {
return false;
const char *test2_res =
"@HD\tVN:1.4\n"
"@SQ\tSN:blah\n"
- "@RG\tID:fish\n";
+ "@RG\tID:fish\n"
+ "@PG\tID:samtools\tPN:samtools\tVN:x.y.test\tCL:test_filter_header_rg foo bar baz\n";
if (strcmp(hdr->text, test2_res)) {
return false;
return true;
}
-int samtools_test_filter_header_rg_main(int argc, char**argv)
+int samtools_test_filter_header_rg_main(int argc, char *argv[])
{
// test state
const int NUM_TESTS = 2;
int failure = 0;
int getopt_char;
+ char *test_argv[] = { "test_filter_header_rg", "foo\tbar", "baz" };
+ char *arg_list = stringify_argv(3, test_argv);
while ((getopt_char = getopt(argc, argv, "v")) != -1) {
switch (getopt_char) {
case 'v':
// test
xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
- bool result_1 = filter_header_rg(hdr1, id_to_keep_1);
+ bool result_1 = filter_header_rg(hdr1, id_to_keep_1, arg_list);
fclose(pysam_stderr);
if (verbose) fprintf(pysam_stdout, "END RUN test 1\n");
// test
xfreopen(tempfname, "w", pysam_stderr); // Redirect pysam_stderr to pipe
- bool result_2 = filter_header_rg(hdr2, id_to_keep_2);
+ bool result_2 = filter_header_rg(hdr2, id_to_keep_2, arg_list);
fclose(pysam_stderr);
if (verbose) fprintf(pysam_stdout, "END RUN test 2\n");
// Cleanup
free(res.s);
+ free(arg_list);
remove(tempfname);
if (failure > 0)
fprintf(orig_pysam_stderr, "%d failures %d successes\n", failure, success);
/* test/test.c -- test harness utility routines.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014, 2016 Genome Research Ltd.
Author: Martin O. Pollard <mp15@sanger.ac.uk>
}
printf("text: \"%s\"\n", hdr->text);
}
+
+// For tests, just return a constant that can be embedded in expected output.
+const char *samtools_version(void)
+{
+ return "x.y.test";
+}
/* test/test.c -- test harness utility routines.
- Copyright (C) 2014 Genome Research Ltd.
+ Copyright (C) 2014, 2016 Genome Research Ltd.
Author: Martin O. Pollard <mp15@sanger.ac.uk>
}
fprintf(pysam_stdout, "text: \"%s\"\n", hdr->text);
}
+
+// For tests, just return a constant that can be embedded in expected output.
+const char *samtools_version(void)
+{
+ return "x.y.test";
+}
-#define SAMTOOLS_VERSION "1.3.1"
+#define SAMTOOLS_VERSION "1.4.1"
using cython and a high-level API for convenient access to the data
within standard genomic file formats.
-The current version wraps htslib-1.3.1, samtools-1.3.1 and bcftools-1.3.1.
+The current version wraps htslib-1.4.1, samtools-1.4.1 and bcftools-1.4.1.
See:
http://www.htslib.org
configure_script = os.path.join(library_dir, "configure")
+ on_rtd = os.environ.get("READTHEDOCS") == "True"
+ # RTD has no bzip2 development libraries installed:
+ if on_rtd:
+ env_options = "--disable-bz2"
+
if not os.path.exists(configure_script):
raise ValueError(
"configure script {} does not exist".format(configure_script))
# htslib built from sources included in the pysam
# package.
htslib_library_dirs = [
- 'pysam',
- ".",
+ "pysam", # when using setup.py develop?
+ ".", # when using setup.py develop?
os.path.join("build", distutils_dir_name("lib"), "pysam")]
htslib_include_dirs = ['htslib']
else:
raise ValueError("unknown HTSLIB value '%s'" % HTSLIB_MODE)
-internal_htslib_libraries = [os.path.splitext("chtslib{}".format(sysconfig.get_config_var('SO')))[0]]
+suffix = sysconfig.get_config_var('EXT_SUFFIX')
+if not suffix:
+ suffix = sysconfig.get_config_var('SO')
+internal_htslib_libraries = [os.path.splitext("chtslib{}".format(suffix))[0]]
+
+internal_tools_libraries = [
+ os.path.splitext("csamtools{}".format(suffix))[0],
+ os.path.splitext("cbcftools{}".format(suffix))[0],
+ ]
# build config.py
with open(os.path.join("pysam", "config.py"), "w") as outf:
if line.startswith("#define"):
key, value = re.match(
"#define (\S+)\s+(\S+)", line).groups()
- config_values[key] = int(value)
+ config_values[key] = value
for key in ["ENABLE_PLUGINS",
"HAVE_COMMONCRYPTO",
"HAVE_GMTIME_R",
shared_htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
- runtime_library_dirs=htslib_library_dirs,
include_dirs=["pysam", "."] + include_os + htslib_include_dirs,
libraries=external_htslib_libraries,
language="c",
"pysam.libcsamfile",
[source_pattern % "samfile",
"pysam/htslib_util.c",
- "pysam/samfile_util.c",
- "samtools/kprobaln.c"] +
+ "pysam/samfile_util.c"] +
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
"pysam.libcalignmentfile",
[source_pattern % "alignmentfile",
"pysam/htslib_util.c",
- "pysam/samfile_util.c",
- "samtools/kprobaln.c"] +
+ "pysam/samfile_util.c"] +
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
"pysam.libcalignedsegment",
[source_pattern % "alignedsegment",
"pysam/htslib_util.c",
- "pysam/samfile_util.c",
- "samtools/kprobaln.c"] +
+ "pysam/samfile_util.c"] +
htslib_sources +
os_c_files,
library_dirs=htslib_library_dirs,
define_macros=define_macros
)
+
+
cutils = Extension(
"pysam.libcutils",
[source_pattern % "utils", "pysam/pysam_util.c"] +
+ htslib_sources +
+ os_c_files,
+ library_dirs=["pysam"] + htslib_library_dirs,
+ include_dirs=["pysam", "."] +
+ include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries + internal_tools_libraries,
+ language="c",
+ extra_compile_args=extra_compile_args,
+ define_macros=define_macros
+)
+
+csamtools = Extension(
+ "pysam.libcsamtools",
+ [source_pattern % "samtools"] +
glob.glob(os.path.join("samtools", "*.pysam.c")) +
- # glob.glob(os.path.join("samtools", "*", "*.pysam.c")) +
+ htslib_sources +
+ os_c_files,
+ library_dirs=["pysam"] + htslib_library_dirs,
+ include_dirs=["samtools", "pysam", "."] +
+ include_os + htslib_include_dirs,
+ libraries=external_htslib_libraries + internal_htslib_libraries,
+ language="c",
+ extra_compile_args=extra_compile_args,
+ define_macros=define_macros
+)
+
+cbcftools = Extension(
+ "pysam.libcbcftools",
+ [source_pattern % "bcftools"] +
glob.glob(os.path.join("bcftools", "*.pysam.c")) +
- # glob.glob(os.path.join("bcftools", "*", "*.pysam.c")) +
htslib_sources +
os_c_files,
library_dirs=["pysam"] + htslib_library_dirs,
- include_dirs=["samtools", "bcftools", "pysam", "."] +
+ include_dirs=["bcftools", "pysam", "."] +
include_os + htslib_include_dirs,
libraries=external_htslib_libraries + internal_htslib_libraries,
language="c",
cbcf,
cbgzf,
cfaidx,
+ csamtools,
+ cbcftools,
cutils],
'cmdclass': cmdclass,
'package_dir': package_dirs,
def test_infer_query_length(self):
'''Test infer_query_length on M|=|X|I|D|H|S cigar ops'''
a = self.buildRead()
- a.cigarstring = '15M'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '15='
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '15X'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '5M5I5M'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '5M5D5M'
- self.assertEqual(a.infer_query_length(), 10)
- a.cigarstring = '5H10M'
- self.assertEqual(a.infer_query_length(), 15)
- a.cigarstring = '5S10M'
- self.assertEqual(a.infer_query_length(), 15)
+ a.cigarstring = '40M'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '40='
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '40X'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '20M5I20M'
+ self.assertEqual(a.infer_query_length(), 45)
+ a.cigarstring = '20M5D20M'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '5H35M'
+ self.assertEqual(a.infer_query_length(), 35)
+ a.cigarstring = '5S35M'
+ self.assertEqual(a.infer_query_length(), 40)
+ a.cigarstring = '35M5H'
+ self.assertEqual(a.infer_query_length(), 35)
+ a.cigarstring = '35M5S'
+ self.assertEqual(a.infer_query_length(), 40)
+
+ def test_infer_read_length(self):
+ '''Test infer_read_length on M|=|X|I|D|H|S cigar ops'''
+ a = self.buildRead()
+ a.cigarstring = '40M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '40='
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '40X'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '20M5I20M'
+ self.assertEqual(a.infer_read_length(), 45)
+ a.cigarstring = '20M5D20M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '5H35M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '5S35M'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '35M5H'
+ self.assertEqual(a.infer_read_length(), 40)
+ a.cigarstring = '35M5S'
+ self.assertEqual(a.infer_read_length(), 40)
def test_get_aligned_pairs_soft_clipping(self):
a = self.buildRead()
self.assertEqual(a.query_alignment_length, 20)
a.cigarstring = "20M1S"
self.assertEqual(a.query_alignment_length, 20)
+ a.cigarstring = "20M1H"
+ self.assertEqual(a.query_alignment_length, 20)
a.cigarstring = "1S20M"
self.assertEqual(a.query_alignment_length, 20)
+ a.cigarstring = "1H20M"
+ self.assertEqual(a.query_alignment_length, 20)
a.cigarstring = "1S20M1S"
self.assertEqual(a.query_alignment_length, 20)
+ a.cigarstring = "1H20M1H"
+ self.assertEqual(a.query_alignment_length, 20)
def test_query_length_is_limited(self):
a = self.buildRead()
a.query_name = "A" * 1
- a.query_name = "A" * 254
+ a.query_name = "A" * 251
self.assertRaises(
ValueError,
setattr,
a,
"query_name",
- "A" * 255)
+ "A" * 252)
class TestCigarStats(ReadTest):
self.assertEqual(s, p.tostring(pysamf))
+class TestEnums(unittest.TestCase):
+
+ def test_cigar_enums_are_defined(self):
+ self.assertEqual(pysam.CMATCH, 0)
+ self.assertEqual(pysam.CINS, 1)
+ self.assertEqual(pysam.CDEL, 2)
+ self.assertEqual(pysam.CREF_SKIP, 3)
+ self.assertEqual(pysam.CSOFT_CLIP, 4)
+ self.assertEqual(pysam.CHARD_CLIP, 5)
+ self.assertEqual(pysam.CPAD, 6)
+ self.assertEqual(pysam.CEQUAL, 7)
+ self.assertEqual(pysam.CDIFF, 8)
+ self.assertEqual(pysam.CBACK, 9)
+
+ def test_sam_flags_are_defined(self):
+ self.assertEqual(pysam.FPAIRED, 1)
+ self.assertEqual(pysam.FPROPER_PAIR, 2)
+ self.assertEqual(pysam.FUNMAP, 4)
+ self.assertEqual(pysam.FMUNMAP, 8)
+ self.assertEqual(pysam.FREVERSE, 16)
+ self.assertEqual(pysam.FMREVERSE, 32)
+ self.assertEqual(pysam.FREAD1, 64)
+ self.assertEqual(pysam.FREAD2, 128)
+ self.assertEqual(pysam.FSECONDARY, 256)
+ self.assertEqual(pysam.FQCFAIL, 512)
+ self.assertEqual(pysam.FDUP, 1024)
+ self.assertEqual(pysam.FSUPPLEMENTARY, 2048)
+
+
if __name__ == "__main__":
unittest.main()
input_filename,
reference_filename,
output_filename,
- input_mode, output_mode,
+ input_mode,
+ output_mode,
sequence_filename=None,
use_template=True,
- checkf=checkBinaryEqual):
+ checkf=checkBinaryEqual,
+ **kwargs):
'''iterate through *input_filename* writing to
*output_filename* and comparing the output to
*reference_filename*.
output_filename,
output_mode,
reference_filename=sequence_filename,
- template=infile)
+ template=infile, **kwargs)
else:
outfile = pysam.AlignmentFile(
output_filename,
reference_names=infile.references,
reference_lengths=infile.lengths,
reference_filename=sequence_filename,
- add_sq_text=False)
+ add_sq_text=False,
+ **kwargs)
iter = infile.fetch()
"tmp_ex2.sam",
"r", "wh")
+ def testSAM2SAMWithoutHeader(self):
+ self.checkEcho("ex2.sam",
+ "ex1.sam",
+ "tmp_ex2.sam",
+ "r", "w",
+ add_sam_header=False)
+
def testBAM2BAM(self):
self.checkEcho("ex2.bam",
"ex2.bam",
# self.checkEcho(input_filename, reference_filename, output_filename,
# "rb", "wb", use_template=False)
- # Release 0.8.0
- # no samfiles without header
- def testSAM2SAMWithoutHeader(self):
- self.checkEcho("ex2.sam",
- "ex1.sam",
- "tmp_ex2.sam",
- "r", "w")
-
def testReadSamWithoutTargetNames(self):
'''see issue 104.'''
input_filename = os.path.join(
input_filename, "r",
check_header=True)
- infile = pysam.AlignmentFile(
+ with pysam.AlignmentFile(
input_filename,
check_header=False,
- check_sq=False)
-
- # TODO
- # result = list(infile.fetch(until_eof=True))
- # self.assertEqual(2, len(result))
+ check_sq=False) as infile:
+ result = list(infile.fetch(until_eof=True))
+ self.assertEqual(2, len(result))
def testReadBamWithoutTargetNames(self):
'''see issue 104.'''
"r",
check_header=True)
- infile = pysam.AlignmentFile(
- input_filename, check_header=False, check_sq=False)
- result = list(infile.fetch(until_eof=True))
+ with pysam.AlignmentFile(
+ input_filename, check_sq=False) as infile:
+ result = list(infile.fetch(until_eof=True))
- # TODO
- def testReadSamWithoutHeader(self):
+ def test_fail_read_sam_without_header(self):
input_filename = os.path.join(DATADIR, "ex1.sam")
- # reading from a samfile without header is not
- # implemented
self.assertRaises(ValueError,
pysam.AlignmentFile,
input_filename,
"r")
- # TODO
- # without check_header header is no read
- # leading to segfault
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "r",
- # check_header=False)
+ def test_pass_read_sam_without_header_with_refs(self):
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.sam"),
+ "r",
+ reference_names=["chr1", "chr2"],
+ reference_lengths=[1575, 1584]) as samfile:
+ self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270)
- # TODO
- # def testReadUnformattedFile(self):
- # '''test reading from a file that is not bam/sam formatted'''
- # input_filename = os.path.join(DATADIR, 'Makefile')
-
- # # bam - file raise error
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "rb")
-
- # # sam - file error, but can't fetch
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "r")
-
- # self.assertRaises(ValueError,
- # pysam.AlignmentFile,
- # input_filename,
- # "r",
- # check_header=False)
+ def test_pass_read_sam_with_header_without_header_check(self):
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex2.sam"),
+ "r", check_header=False) as samfile:
+ self.assertEqual(len(list(samfile.fetch(until_eof=True))), 3270)
+
+ def test_fail_when_reading_unformatted_files(self):
+ '''test reading from a file that is not bam/sam formatted'''
+ input_filename = os.path.join(DATADIR, 'Makefile')
+
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
+ input_filename,
+ "rb")
+
+ self.assertRaises(ValueError,
+ pysam.AlignmentFile,
+ input_filename,
+ "r")
def testBAMWithoutAlignedSegments(self):
'''see issue 117'''
check_sq=False)
samfile.fetch('chr2')
-
+ def test_fetch_by_tid(self):
+ with pysam.AlignmentFile(os.path.join(DATADIR, "ex1.bam"), "rb") as samfile:
+ self.assertEqual(len(list(samfile.fetch('chr1'))),
+ len(list(samfile.fetch(tid=0))))
+ self.assertEqual(len(list(samfile.fetch('chr2'))),
+ len(list(samfile.fetch(tid=1))))
+ self.assertRaises(
+ IndexError,
+ samfile.fetch,
+ tid=2)
+ self.assertRaises(
+ IndexError,
+ samfile.fetch,
+ tid=-1)
+ self.assertEqual(len(list(samfile.fetch('chr1',start=1000, end=2000))),
+ len(list(samfile.fetch(tid=0, start=1000, end=2000))))
+
class TestAutoDetect(unittest.TestCase):
# os.unlink(tmpfilename)
- def testBAMPerRead(self):
+ def test_pass_if_reads_binary_equal(self):
'''check if individual reads are binary equal.'''
infile = pysam.AlignmentFile(self.bamfile, "rb")
'''see pull request 50.'''
- def testTruncatedBam(self):
+ def testTruncatedBam2(self):
+ self.assertRaises(IOError,
+ pysam.AlignmentFile,
+ os.path.join(DATADIR, 'ex2_truncated.bam'))
- s = pysam.AlignmentFile(
- os.path.join(DATADIR, 'ex2_truncated.bam'))
+ def testTruncatedBam2(self):
+ s = pysam.AlignmentFile(os.path.join(DATADIR, 'ex2_truncated.bam'),
+ ignore_truncation=True)
iterall = lambda x: len([a for a in x])
self.assertRaises(IOError, iterall, s)
- def testTruncatedBamFetch(self):
- '''See comments for pull request at
- https://github.com/pysam-developers/pysam/pull/50#issuecomment-64928625
- '''
- # Currently there is no way to detect truncated
- # files through hts_iter_fetch, so this test is
- # disabled
- return
- s = pysam.AlignmentFile(
- os.path.join(DATADIR, 'ex2_truncated.bam'))
- iterall = lambda x: len([a for a in x])
- self.assertRaises(IOError, iterall, s.fetch())
COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204,
0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78,
+++ /dev/null
-#!/usr/bin/env python
-'''unit testing code for pysam.
-
-Execute in the :file:`tests` directory as it requires the Makefile
-and data files located there.
-'''
-
-import pysam
-import pysam.samtools
-import unittest
-import os
-import shutil
-import sys
-import collections
-import subprocess
-import logging
-import array
-from TestUtils import checkBinaryEqual, checkURL, force_str
-
-DATADIR = "pysam_data"
-
-
-class BasicTestBAMFetch(unittest.TestCase):
-
- '''basic first test - detailed testing
- if information in file is consistent
- with information in AlignedRead object.'''
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.bam"),
- "rb")
- self.reads = list(self.samfile.fetch())
-
- def testARqname(self):
- self.assertEqual(
- self.reads[0].qname,
- "read_28833_29006_6945",
- "read name mismatch in read 1: %s != %s" % (
- self.reads[0].qname, "read_28833_29006_6945"))
- self.assertEqual(
- self.reads[1].qname,
- "read_28701_28881_323b",
- "read name mismatch in read 2: %s != %s" % (
- self.reads[1].qname, "read_28701_28881_323b"))
-
- def testARflag(self):
- self.assertEqual(
- self.reads[0].flag, 99,
- "flag mismatch in read 1: %s != %s" % (
- self.reads[0].flag, 99))
- self.assertEqual(
- self.reads[1].flag, 147,
- "flag mismatch in read 2: %s != %s" % (
- self.reads[1].flag, 147))
-
- def testARrname(self):
- self.assertEqual(
- self.reads[0].rname, 0,
- "chromosome/target id mismatch in read 1: %s != %s" %
- (self.reads[0].rname, 0))
- self.assertEqual(
- self.reads[1].rname, 1,
- "chromosome/target id mismatch in read 2: %s != %s" %
- (self.reads[1].rname, 1))
-
- def testARpos(self):
- self.assertEqual(
- self.reads[0].pos, 33 - 1,
- "mapping position mismatch in read 1: %s != %s" %
- (self.reads[0].pos, 33 - 1))
- self.assertEqual(
- self.reads[1].pos, 88 - 1,
- "mapping position mismatch in read 2: %s != %s" %
- (self.reads[1].pos, 88 - 1))
-
- def testARmapq(self):
- self.assertEqual(
- self.reads[0].mapq, 20,
- "mapping quality mismatch in read 1: %s != %s" %
- (self.reads[0].mapq, 20))
- self.assertEqual(
- self.reads[1].mapq, 30,
- "mapping quality mismatch in read 2: %s != %s" % (
- self.reads[1].mapq, 30))
-
- def testARcigar(self):
- self.assertEqual(
- self.reads[0].cigar,
- [(0, 10), (2, 1), (0, 25)],
- "read name length mismatch in read 1: %s != %s" %
- (self.reads[0].cigar, [(0, 10), (2, 1), (0, 25)]))
- self.assertEqual(
- self.reads[1].cigar, [(0, 35)],
- "read name length mismatch in read 2: %s != %s" %
- (self.reads[1].cigar, [(0, 35)]))
-
- def testARcigarstring(self):
- self.assertEqual(self.reads[0].cigarstring, '10M1D25M')
- self.assertEqual(self.reads[1].cigarstring, '35M')
-
- def testARmrnm(self):
- self.assertEqual(
- self.reads[0].mrnm, 0,
- "mate reference sequence name mismatch in read 1: %s != %s" %
- (self.reads[0].mrnm, 0))
- self.assertEqual(
- self.reads[1].mrnm, 1,
- "mate reference sequence name mismatch in read 2: %s != %s" %
- (self.reads[1].mrnm, 1))
- self.assertEqual(
- self.reads[0].rnext, 0,
- "mate reference sequence name mismatch in read 1: %s != %s" %
- (self.reads[0].rnext, 0))
- self.assertEqual(
- self.reads[1].rnext, 1,
- "mate reference sequence name mismatch in read 2: %s != %s" %
- (self.reads[1].rnext, 1))
-
- def testARmpos(self):
- self.assertEqual(self.reads[
- 0].mpos, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].mpos, 200 - 1))
- self.assertEqual(self.reads[
- 1].mpos, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].mpos, 500 - 1))
- self.assertEqual(self.reads[
- 0].pnext, 200 - 1, "mate mapping position mismatch in read 1: %s != %s" % (self.reads[0].pnext, 200 - 1))
- self.assertEqual(self.reads[
- 1].pnext, 500 - 1, "mate mapping position mismatch in read 2: %s != %s" % (self.reads[1].pnext, 500 - 1))
-
- def testARisize(self):
- self.assertEqual(self.reads[0].isize, 167, "insert size mismatch in read 1: %s != %s" % (
- self.reads[0].isize, 167))
- self.assertEqual(self.reads[1].isize, 412, "insert size mismatch in read 2: %s != %s" % (
- self.reads[1].isize, 412))
- self.assertEqual(self.reads[0].tlen, 167, "insert size mismatch in read 1: %s != %s" % (
- self.reads[0].tlen, 167))
- self.assertEqual(self.reads[1].tlen, 412, "insert size mismatch in read 2: %s != %s" % (
- self.reads[1].tlen, 412))
-
- def testARseq(self):
- self.assertEqual(self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 1: %s != %s" % (
- self.reads[0].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
- self.assertEqual(self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "sequence size mismatch in read 2: %s != %s" % (
- self.reads[1].seq, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"))
- self.assertEqual(self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "sequence mismatch in read 4: %s != %s" % (
- self.reads[3].seq, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
-
- def testARqual(self):
- self.assertEqual(self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "quality string mismatch in read 1: %s != %s" % (self.reads[0].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
- self.assertEqual(self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<", "quality string mismatch in read 2: %s != %s" % (
- self.reads[1].qual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"))
- self.assertEqual(self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "quality string mismatch in read 3: %s != %s" % (self.reads[3].qual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
-
- def testARquery(self):
- self.assertEqual(self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG", "query mismatch in read 1: %s != %s" % (
- self.reads[0].query, "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"))
- self.assertEqual(self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA", "query size mismatch in read 2: %s != %s" % (
- self.reads[1].query, "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"))
- self.assertEqual(self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT", "query mismatch in read 4: %s != %s" % (
- self.reads[3].query, "TAGCTAGCTACCTATATCTTGGTCTT"))
-
- def testARqqual(self):
- self.assertEqual(
- self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<",
- "qquality string mismatch in read 1: %s != %s" %
- (self.reads[0].qqual, "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"))
- self.assertEqual(
- self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<",
- "qquality string mismatch in read 2: %s != %s" %
- (self.reads[1].qqual, "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"))
- self.assertEqual(
- self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22",
- "qquality string mismatch in read 3: %s != %s" %
- (self.reads[3].qqual, "<<<<<<<<<<<<<<<<<:<9/,&,22"))
-
- def testPresentOptionalFields(self):
- self.assertEqual(
- self.reads[0].opt('NM'), 1,
- "optional field mismatch in read 1, NM: %s != %s" %
- (self.reads[0].opt('NM'), 1))
- self.assertEqual(
- self.reads[0].opt('RG'), 'L1',
- "optional field mismatch in read 1, RG: %s != %s" %
- (self.reads[0].opt('RG'), 'L1'))
- self.assertEqual(
- self.reads[1].opt('RG'), 'L2',
- "optional field mismatch in read 2, RG: %s != %s" %
- (self.reads[1].opt('RG'), 'L2'))
- self.assertEqual(
- self.reads[1].opt('MF'), 18,
- "optional field mismatch in read 2, MF: %s != %s" %
- (self.reads[1].opt('MF'), 18))
-
- def testPairedBools(self):
- self.assertEqual(self.reads[0].is_paired, True,
- "is paired mismatch in read 1: %s != %s" % (
- self.reads[0].is_paired, True))
- self.assertEqual(self.reads[1].is_paired, True,
- "is paired mismatch in read 2: %s != %s" % (
- self.reads[1].is_paired, True))
- self.assertEqual(self.reads[0].is_proper_pair, True,
- "is proper pair mismatch in read 1: %s != %s" % (
- self.reads[0].is_proper_pair, True))
- self.assertEqual(self.reads[1].is_proper_pair, True,
- "is proper pair mismatch in read 2: %s != %s" % (
- self.reads[1].is_proper_pair, True))
-
- def testTags(self):
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')])
- self.assertEqual(self.reads[1].tags,
- [('MF', 18), ('RG', 'L2'),
- ('PG', 'P2'), ('XT', 'R')])
-
- def testAddTags(self):
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')]))
-
- self.reads[0].setTag('X1', 'C')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 'C'), ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
- self.reads[0].setTag('X2', 5)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 5), ('X1', 'C'),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
- # add with replacement
- self.reads[0].setTag('X2', 10)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 10), ('X1', 'C'),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
-
- # add without replacement
- self.reads[0].setTag('X2', 5, replace=False)
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X2', 10), ('X1', 'C'),
- ('X2', 5),
- ('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ]))
-
- def testAddTagsType(self):
- self.reads[0].tags = None
- self.assertEqual(self.reads[0].tags, [])
-
- self.reads[0].setTag('X1', 5.0)
- self.reads[0].setTag('X2', "5.0")
- self.reads[0].setTag('X3', 5)
-
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5)]))
-
- # test setting float for int value
- self.reads[0].setTag('X4', 5, value_type='d')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5),
- ('X4', 5.0)]))
-
- # test setting int for float value - the
- # value will be rounded.
- self.reads[0].setTag('X5', 5.2, value_type='i')
- self.assertEqual(sorted(self.reads[0].tags),
- sorted([('X1', 5.0),
- ('X2', "5.0"),
- ('X3', 5),
- ('X4', 5.0),
- ('X5', 5)]))
-
- # test setting invalid type code
- self.assertRaises(ValueError, self.reads[0].setTag, 'X6', 5.2, 'g')
-
- def testTagsUpdatingFloat(self):
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U')])
- self.reads[0].tags += [('XC', 5.0)]
- self.assertEqual(self.reads[0].tags,
- [('NM', 1), ('RG', 'L1'),
- ('PG', 'P1'), ('XT', 'U'), ('XC', 5.0)])
-
- def testOpt(self):
- self.assertEqual(self.reads[0].opt("XT"), "U")
- self.assertEqual(self.reads[1].opt("XT"), "R")
-
- def testMissingOpt(self):
- self.assertRaises(KeyError, self.reads[0].opt, "XP")
-
- def testEmptyOpt(self):
- self.assertRaises(KeyError, self.reads[2].opt, "XT")
-
- def tearDown(self):
- self.samfile.close()
-
-
-class BasicTestBAMFile(BasicTestBAMFetch):
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.sam"),
- "r")
- self.reads = [r for r in self.samfile]
-
-
-class BasicTestSAMFile(BasicTestBAMFetch):
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.sam"),
- "r")
- self.reads = [r for r in self.samfile]
-
-
-class BasicTestSAMFetch(BasicTestBAMFetch):
-
- def setUp(self):
- self.samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex3.sam"),
- "r")
- self.reads = list(self.samfile.fetch())
-
-
-# needs to be implemented
-# class TestAlignedReadFromSamWithoutHeader(TestAlignedReadFromBam):
-#
-# def setUp(self):
-# self.samfile=pysam.Samfile( "ex7.sam","r" )
-# self.reads=list(self.samfile.fetch())
-
-
-class TestIO(unittest.TestCase):
-
- '''check if reading samfile and writing a samfile are consistent.'''
-
- def checkEcho(self,
- input_filename,
- reference_filename,
- output_filename,
- input_mode, output_mode,
- use_template=True):
- '''iterate through *input_filename* writing to *output_filename* and
- comparing the output to *reference_filename*.
-
- The files are opened according to the *input_mode* and *output_mode*.
-
- If *use_template* is set, the header is copied from infile
- using the template mechanism, otherwise target names and
- lengths are passed explicitly.
-
- '''
-
- infile = pysam.Samfile(os.path.join(DATADIR, input_filename),
- input_mode)
- if use_template:
- outfile = pysam.Samfile(output_filename,
- output_mode,
- template=infile)
- else:
- outfile = pysam.Samfile(output_filename,
- output_mode,
- referencenames=infile.references,
- referencelengths=infile.lengths,
- add_sq_text=False)
-
- iter = infile.fetch()
-
- for x in iter:
- outfile.write(x)
- infile.close()
- outfile.close()
-
- self.assertTrue(
- checkBinaryEqual(os.path.join(DATADIR, reference_filename),
- output_filename),
- "files %s and %s are not the same" % (reference_filename,
- output_filename))
-
- def testReadWriteBam(self):
-
- input_filename = "ex1.bam"
- output_filename = "pysam_ex1.bam"
- reference_filename = "ex1.bam"
-
- self.checkEcho(input_filename, reference_filename, output_filename,
- "rb", "wb", use_template=True)
-
- # Disabled - should work, files are not binary equal, but are
- # non-binary equal:
- # diff <(samtools view pysam_ex1.bam) <(samtools view pysam_data/ex1.bam)
- # def testReadWriteBamWithTargetNames(self):
- # input_filename = "ex1.bam"
- # output_filename = "pysam_ex1.bam"
- # reference_filename = "ex1.bam"
-
- # self.checkEcho(input_filename, reference_filename, output_filename,
- # "rb", "wb", use_template=False)
-
- def testReadWriteSamWithHeader(self):
-
- input_filename = "ex2.sam"
- output_filename = "pysam_ex2.sam"
- reference_filename = "ex2.sam"
-
- self.checkEcho(input_filename,
- reference_filename,
- output_filename,
- "r", "wh")
-
- # Release 0.8.0
- # no samfiles without header
- def testReadWriteSamWithoutHeader(self):
-
- input_filename = "ex2.sam"
- output_filename = "pysam_ex2.sam"
- reference_filename = "ex1.sam"
-
- self.checkEcho(input_filename,
- reference_filename,
- output_filename,
- "r", "w")
-
- def testReadSamWithoutTargetNames(self):
- '''see issue 104.'''
- input_filename = os.path.join(DATADIR,
- "example_unmapped_reads_no_sq.sam")
-
- # raise exception in default mode
- self.assertRaises(ValueError, pysam.Samfile, input_filename, "r")
-
- # raise exception if no SQ files
- self.assertRaises(ValueError, pysam.Samfile,
- input_filename, "r",
- check_header=True)
-
- infile = pysam.Samfile(
- input_filename,
- check_header=False,
- check_sq=False)
-
- # TODO
- # result = list(infile.fetch(until_eof=True))
- # self.assertEqual(2, len(result))
-
- def testReadBamWithoutTargetNames(self):
- '''see issue 104.'''
- input_filename = os.path.join(
- DATADIR, "example_unmapped_reads_no_sq.bam")
-
- # raise exception in default mode
- self.assertRaises(ValueError, pysam.Samfile, input_filename, "r")
-
- # raise exception if no SQ files
- self.assertRaises(ValueError, pysam.Samfile, input_filename, "r",
- check_header=True)
-
- infile = pysam.Samfile(
- input_filename, check_header=False, check_sq=False)
- result = list(infile.fetch(until_eof=True))
-
- # TODO
- def testReadSamWithoutHeader(self):
- input_filename = os.path.join(DATADIR, "ex1.sam")
-
- # reading from a samfile without header is not
- # implemented
- self.assertRaises(ValueError,
- pysam.Samfile,
- input_filename,
- "r")
-
- # TODO
- # without check_header header is no read
- # leading to segfault
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "r",
- # check_header=False)
-
- # TODO
- # def testReadUnformattedFile(self):
- # '''test reading from a file that is not bam/sam formatted'''
- # input_filename = os.path.join(DATADIR, 'Makefile')
-
- # # bam - file raise error
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "rb")
-
- # # sam - file error, but can't fetch
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "r")
-
- # self.assertRaises(ValueError,
- # pysam.Samfile,
- # input_filename,
- # "r",
- # check_header=False)
-
- def testBAMWithoutAlignedReads(self):
- '''see issue 117'''
- input_filename = os.path.join(DATADIR, "test_unaligned.bam")
- samfile = pysam.Samfile(input_filename, "rb", check_sq=False)
- samfile.fetch(until_eof=True)
-
- def testBAMWithShortBAI(self):
- '''see issue 116'''
- input_filename = os.path.join(DATADIR, "example_bai.bam")
- samfile = pysam.Samfile(input_filename, "rb", check_sq=False)
- samfile.fetch('chr2')
-
- def testFetchFromClosedFile(self):
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- samfile.close()
- self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
-
- def testClosedFile(self):
- '''test that access to a closed samfile raises ValueError.'''
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- samfile.close()
- self.assertRaises(ValueError, samfile.fetch, 'chr1', 100, 120)
- self.assertRaises(ValueError, samfile.pileup, 'chr1', 100, 120)
- self.assertRaises(ValueError, samfile.getrname, 0)
- # TODO
- self.assertRaises(ValueError, samfile.tell)
- self.assertRaises(ValueError, samfile.seek, 0)
- self.assertRaises(ValueError, getattr, samfile, "nreferences")
- self.assertRaises(ValueError, getattr, samfile, "references")
- self.assertRaises(ValueError, getattr, samfile, "lengths")
- self.assertRaises(ValueError, getattr, samfile, "text")
- self.assertRaises(ValueError, getattr, samfile, "header")
-
- # write on closed file
- self.assertEqual(0, samfile.write(None))
-
- def testAutoDetection(self):
- '''test if autodetection works.'''
-
- # TODO
- # samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.sam"))
- # self.assertRaises(ValueError, samfile.fetch, 'chr1')
- # samfile.close()
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.bam"))
- samfile.fetch('chr1')
- samfile.close()
-
- # TOOD
- # def testReadingFromSamFileWithoutHeader(self):
- # '''read from samfile without header.
- # '''
- # samfile = pysam.Samfile(os.path.join(DATADIR, "ex7.sam"),
- # check_header=False,
- # check_sq=False)
- # self.assertRaises(NotImplementedError, samfile.__iter__)
-
- def testReadingFromFileWithoutIndex(self):
- '''read from bam file without index.'''
-
- shutil.copyfile(os.path.join(DATADIR, "ex2.bam"), 'tmp_ex2.bam')
- samfile = pysam.Samfile('tmp_ex2.bam',
- "rb")
- self.assertRaises(ValueError, samfile.fetch)
- self.assertEqual(len(list(samfile.fetch(until_eof=True))),
- 3270)
- os.unlink('tmp_ex2.bam')
-
- # def testReadingUniversalFileMode(self):
- # '''read from samfile without header.
- # '''
-
- # input_filename = "ex2.sam"
- # output_filename = "pysam_ex2.sam"
- # reference_filename = "ex1.sam"
-
- # self.checkEcho(input_filename,
- # reference_filename,
- # output_filename,
- # "rU", "w")
-
- def testHead(self):
- '''test IteratorRowHead'''
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- l10 = list(samfile.head(10))
- l100 = list(samfile.head(100))
- self.assertEqual(len(l10), 10)
- self.assertEqual(len(l100), 100)
- self.assertEqual(list(map(str, l10)),
- list(map(str, l100[:10])))
-
-
-class TestFloatTagBug(unittest.TestCase):
-
- '''see issue 71'''
-
- def testFloatTagBug(self):
- '''a float tag before another exposed a parsing bug in bam_aux_get.
-
- Fixed in 0.1.19
- '''
- samfile = pysam.Samfile(os.path.join(DATADIR, "tag_bug.bam"))
- read = next(samfile.fetch(until_eof=True))
- self.assertTrue(('XC', 1) in read.tags)
- self.assertEqual(read.opt('XC'), 1)
-
-
-class TestLargeFieldBug(unittest.TestCase):
-
- '''see issue 100'''
-
- def testLargeFileBug(self):
- '''when creating a read with a large entry in the tag field
- causes an errror:
- NotImplementedError: tags field too large
- '''
- samfile = pysam.Samfile(os.path.join(DATADIR, "issue100.bam"))
- read = next(samfile.fetch(until_eof=True))
- new_read = pysam.AlignedRead()
- new_read.tags = read.tags
- self.assertEqual(new_read.tags, read.tags)
-
-
-class TestTagParsing(unittest.TestCase):
-
- '''tests checking the accuracy of tag setting and retrieval.'''
-
- def makeRead(self):
- a = pysam.AlignedRead()
- a.qname = "read_12345"
- a.tid = 0
- a.seq = "ACGT" * 3
- a.flag = 0
- a.rname = 0
- a.pos = 1
- a.mapq = 20
- a.cigar = ((0, 10), (2, 1), (0, 25))
- a.mrnm = 0
- a.mpos = 200
- a.isize = 0
- a.qual = "1234" * 3
- # todo: create tags
- return a
-
- def testNegativeIntegers(self):
- x = -2
- aligned_read = self.makeRead()
- aligned_read.tags = [("XD", int(x))]
- # print (aligned_read.tags)
-
- def testNegativeIntegers2(self):
- x = -2
- r = self.makeRead()
- r.tags = [("XD", int(x))]
- outfile = pysam.Samfile("test.bam",
- "wb",
- referencenames=("chr1",),
- referencelengths = (1000,))
- outfile.write(r)
- outfile.close()
-
- def testCigarString(self):
- r = self.makeRead()
- self.assertEqual(r.cigarstring, "10M1D25M")
- r.cigarstring = "20M10D20M"
- self.assertEqual(r.cigar, [(0, 20), (2, 10), (0, 20)])
- # unsetting cigar string
- r.cigarstring = None
- self.assertEqual(r.cigarstring, None)
-
- def testCigar(self):
- r = self.makeRead()
- self.assertEqual(r.cigar, [(0, 10), (2, 1), (0, 25)])
- # unsetting cigar string
- r.cigar = None
- self.assertEqual(r.cigar, [])
-
- def testLongTags(self):
- '''see issue 115'''
-
- r = self.makeRead()
- rg = 'HS2000-899_199.L3'
- tags = [('XC', 85), ('XT', 'M'), ('NM', 5),
- ('SM', 29), ('AM', 29), ('XM', 1),
- ('XO', 1), ('XG', 4), ('MD', '37^ACCC29T18'),
- ('XA', '5,+11707,36M1I48M,2;21,-48119779,46M1I38M,2;hs37d5,-10060835,40M1D45M,3;5,+11508,36M1I48M,3;hs37d5,+6743812,36M1I48M,3;19,-59118894,46M1I38M,3;4,-191044002,6M1I78M,3;')]
-
- r.tags = tags
- r.tags += [("RG", rg)] * 100
- tags += [("RG", rg)] * 100
-
- self.assertEqual(tags, r.tags)
-
-
-class TestClipping(unittest.TestCase):
-
- def testClipping(self):
-
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "softclip.bam"),
- "rb")
- for read in self.samfile:
-
- if read.qname == "r001":
- self.assertEqual(read.seq, 'AAAAGATAAGGATA')
- self.assertEqual(read.query, 'AGATAAGGATA')
- self.assertEqual(read.qual, None)
- self.assertEqual(read.qqual, None)
-
- elif read.qname == "r002":
-
- self.assertEqual(read.seq, 'GCCTAAGCTAA')
- self.assertEqual(read.query, 'AGCTAA')
- self.assertEqual(read.qual, '01234567890')
- self.assertEqual(read.qqual, '567890')
-
- elif read.qname == "r003":
-
- self.assertEqual(read.seq, 'GCCTAAGCTAA')
- self.assertEqual(read.query, 'GCCTAA')
- self.assertEqual(read.qual, '01234567890')
- self.assertEqual(read.qqual, '012345')
-
- elif read.qname == "r004":
-
- self.assertEqual(read.seq, 'TAGGC')
- self.assertEqual(read.query, 'TAGGC')
- self.assertEqual(read.qual, '01234')
- self.assertEqual(read.qqual, '01234')
-
-
-class TestIteratorRow(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def checkRange(self, rnge):
- '''compare results from iterator with those from samtools.'''
- ps = list(self.samfile.fetch(region=rnge))
- sa = force_str(
- pysam.samtools.view(
- os.path.join(DATADIR, "ex1.bam"),
- rnge,
- raw=True)).splitlines(True)
- self.assertEqual(
- len(ps), len(sa),
- "unequal number of results for range %s: %i != %i" %
- (rnge, len(ps), len(sa)))
- # check if the same reads are returned and in the same order
- for line, (a, b) in enumerate(list(zip(ps, sa))):
- d = b.split("\t")
- self.assertEqual(
- a.qname, d[0],
- "line %i: read id mismatch: %s != %s" %
- (line, a.rname, d[0]))
- self.assertEqual(
- a.pos, int(d[3]) - 1,
- "line %i: read position mismatch: %s != %s, "
- "\n%s\n%s\n" %
- (line, a.pos, int(d[3]) - 1,
- str(a), str(d)))
- qual = d[10]
- self.assertEqual(
- a.qual, qual,
- "line %i: quality mismatch: %s != %s, \n%s\n%s\n" %
- (line, a.qual, qual,
- str(a), str(d)))
-
- def testIteratePerContig(self):
- '''check random access per contig'''
- for contig in self.samfile.references:
- self.checkRange(contig)
-
- def testIterateRanges(self):
- '''check random access per range'''
- for contig, length in zip(self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange("%s:%i-%i" % (contig, start, start + 90))
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorRowAll(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testIterate(self):
- '''compare results from iterator with those from samtools.'''
- ps = list(self.samfile.fetch())
- sa = force_str(
- pysam.samtools.view(
- os.path.join(DATADIR, "ex1.bam"),
- raw=True)).splitlines(True)
-
- self.assertEqual(
- len(ps), len(sa), "unequal number of results: %i != %i" % (len(ps), len(sa)))
- # check if the same reads are returned
- for line, pair in enumerate(list(zip(ps, sa))):
- data = pair[1].split("\t")
- self.assertEqual(pair[0].qname, data[
- 0], "read id mismatch in line %i: %s != %s" % (line, pair[0].rname, data[0]))
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorColumn(unittest.TestCase):
-
- '''test iterator column against contents of ex4.bam.'''
-
- # note that samfile contains 1-based coordinates
- # 1D means deletion with respect to reference sequence
- #
- mCoverages = {'chr1': [0] * 20 + [1] * 36 + [0] * (100 - 20 - 35),
- 'chr2': [0] * 20 + [1] * 35 + [0] * (100 - 20 - 35),
- }
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex4.bam"),
- "rb")
-
- def checkRange(self, contig, start=None, end=None, truncate=False):
- '''compare results from iterator with those from samtools.'''
- # check if the same reads are returned and in the same order
- for column in self.samfile.pileup(contig, start, end,
- truncate=truncate):
- if truncate:
- self.assertGreaterEqual(column.pos, start)
- self.assertLess(column.pos, end)
- thiscov = len(column.pileups)
- refcov = self.mCoverages[
- self.samfile.getrname(column.tid)][column.pos]
- self.assertEqual(
- thiscov, refcov, "wrong coverage at pos %s:%i %i should be %i" % (
- self.samfile.getrname(column.tid), column.pos, thiscov, refcov))
-
- def testIterateAll(self):
- '''check random access per contig'''
- self.checkRange(None)
-
- def testIteratePerContig(self):
- '''check random access per contig'''
- for contig in self.samfile.references:
- self.checkRange(contig)
-
- def testIterateRanges(self):
- '''check random access per range'''
- for contig, length in zip(
- self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange(contig, start, start + 90)
-
- def testInverse(self):
- '''test the inverse, is point-wise pileup accurate.'''
- for contig, refseq in list(self.mCoverages.items()):
- refcolumns = sum(refseq)
- for pos, refcov in enumerate(refseq):
- columns = list(self.samfile.pileup(contig, pos, pos + 1))
- if refcov == 0:
- # if no read, no coverage
- self.assertEqual(
- len(columns),
- refcov,
- "wrong number of pileup columns returned for position %s:%i, %i should be %i" % (
- contig, pos,
- len(columns), refcov))
- elif refcov == 1:
- # one read, all columns of the read are returned
- self.assertEqual(
- len(columns),
- refcolumns,
- "pileup incomplete at position %i: got %i, expected %i " %
- (pos, len(columns), refcolumns))
-
- def testIterateTruncate(self):
- '''check random access per range'''
- for contig, length in zip(self.samfile.references, self.samfile.lengths):
- for start in range(1, length, 90):
- # this includes empty ranges
- self.checkRange(contig, start, start + 90, truncate=True)
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestIteratorColumn2(unittest.TestCase):
-
- '''test iterator column against contents of ex1.bam.'''
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testStart(self):
- # print self.samfile.fetch().next().pos
- # print self.samfile.pileup().next().pos
- pass
-
- def testTruncate(self):
- '''see issue 107.'''
- # note that ranges in regions start from 1
- p = self.samfile.pileup(region='chr1:170:172', truncate=True)
- columns = [x.pos for x in p]
- self.assertEqual(len(columns), 3)
- self.assertEqual(columns, [169, 170, 171])
-
- p = self.samfile.pileup('chr1', 169, 172, truncate=True)
- columns = [x.pos for x in p]
-
- self.assertEqual(len(columns), 3)
- self.assertEqual(columns, [169, 170, 171])
-
- def testAccessOnClosedIterator(self):
- '''see issue 131
-
- Accessing pileup data after iterator has closed.
- '''
- pcolumn = self.samfile.pileup('chr1', 170, 180).__next__()
- self.assertRaises(ValueError, getattr, pcolumn, "pileups")
-
-
-class TestHeaderSam(unittest.TestCase):
-
- header = {'SQ': [{'LN': 1575, 'SN': 'chr1', 'AH': 'chr1:5000000-5010000'},
- {'LN': 1584, 'SN': 'chr2', 'AH': '*'}],
- 'RG': [{'LB': 'SC_1', 'ID': 'L1', 'SM': 'NA12891', 'PU': 'SC_1_10', "CN": "name:with:colon"},
- {'LB': 'SC_2', 'ID': 'L2', 'SM': 'NA12891', 'PU': 'SC_2_12', "CN": "name:with:colon"}],
- 'PG': [{'ID': 'P1', 'VN': '1.0'}, {'ID': 'P2', 'VN': '1.1'}],
- 'HD': {'VN': '1.0'},
- 'CO': ['this is a comment', 'this is another comment'],
- }
-
- def compareHeaders(self, a, b):
- '''compare two headers a and b.'''
- for ak, av in a.items():
- self.assertTrue(ak in b, "key '%s' not in '%s' " % (ak, b))
- self.assertEqual(av, b[ak])
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.sam"),
- "r")
-
- def testHeaders(self):
- self.compareHeaders(self.header, self.samfile.header)
- self.compareHeaders(self.samfile.header, self.header)
-
- def testNameMapping(self):
- for x, y in enumerate(("chr1", "chr2")):
- tid = self.samfile.gettid(y)
- ref = self.samfile.getrname(x)
- self.assertEqual(tid, x)
- self.assertEqual(ref, y)
-
- self.assertEqual(self.samfile.gettid("chr?"), -1)
- self.assertRaises(ValueError, self.samfile.getrname, 2)
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestHeaderBam(TestHeaderSam):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex3.bam"),
- "rb")
-
-
-class TestHeaderFromRefs(unittest.TestCase):
-
- '''see issue 144
-
- reference names need to be converted to string for python 3
- '''
-
- # def testHeader( self ):
- # refs = ['chr1', 'chr2']
- # tmpfile = "tmp_%i" % id(self)
- # s = pysam.Samfile(tmpfile, 'wb',
- # referencenames=refs,
- # referencelengths=[100]*len(refs))
- # s.close()
-
- # self.assertTrue( checkBinaryEqual( 'issue144.bam', tmpfile ),
- # 'bam files differ')
- # os.unlink( tmpfile )
-
-
-class TestHeader1000Genomes(unittest.TestCase):
-
- '''see issue 110'''
- # bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase2b_alignment/data/NA07048/exome_alignment/NA07048.unmapped.ILLUMINA.bwa.CEU.exome.20120522_p2b.bam"
- bamfile = "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/phase3_EX_or_LC_only_alignment/data/HG00104/alignment/HG00104.chrom11.ILLUMINA.bwa.GBR.low_coverage.20130415.bam"
-
- def testRead(self):
-
- if not checkURL(self.bamfile):
- return
-
- f = pysam.Samfile(self.bamfile, "rb")
- data = f.header.copy()
- self.assertTrue(data)
-
-
-class TestUnmappedReads(unittest.TestCase):
-
- # TODO
- # def testSAM(self):
- # samfile = pysam.Samfile(os.path.join(DATADIR, "ex5.sam"),
- # "r")
- # self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2)
- # samfile.close()
-
- def testBAM(self):
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex5.bam"),
- "rb")
- self.assertEqual(len(list(samfile.fetch(until_eof=True))), 2)
- samfile.close()
-
-
-class TestPileupObjects(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testPileupColumn(self):
- for pcolumn1 in self.samfile.pileup(region="chr1:105"):
- if pcolumn1.pos == 104:
- self.assertEqual(
- pcolumn1.tid, 0, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn1.tid, 0))
- self.assertEqual(
- pcolumn1.pos, 105 - 1, "position mismatch in position 1: %s != %s" % (pcolumn1.pos, 105 - 1))
- self.assertEqual(
- pcolumn1.n, 2, "# reads mismatch in position 1: %s != %s" % (pcolumn1.n, 2))
- for pcolumn2 in self.samfile.pileup(region="chr2:1480"):
- if pcolumn2.pos == 1479:
- self.assertEqual(
- pcolumn2.tid, 1, "chromosome/target id mismatch in position 1: %s != %s" % (pcolumn2.tid, 1))
- self.assertEqual(
- pcolumn2.pos, 1480 - 1, "position mismatch in position 1: %s != %s" % (pcolumn2.pos, 1480 - 1))
- self.assertEqual(
- pcolumn2.n, 12, "# reads mismatch in position 1: %s != %s" % (pcolumn2.n, 12))
-
- def testPileupRead(self):
- for pcolumn1 in self.samfile.pileup(region="chr1:105"):
- if pcolumn1.pos == 104:
- self.assertEqual(
- len(pcolumn1.pileups), 2,
- "# reads aligned to column mismatch in position 1"
- ": %s != %s" %
- (len(pcolumn1.pileups), 2))
-
-
-# self.assertEqual( pcolumn1.pileups[0] # need to test additional
-# properties here
-
- def tearDown(self):
- self.samfile.close()
-
- def testIteratorOutOfScope(self):
- '''test if exception is raised if pileup col is accessed after
- iterator is exhausted.'''
-
- for pileupcol in self.samfile.pileup():
- pass
-
- self.assertRaises(ValueError, getattr, pileupcol, "pileups")
-
-
-class TestContextManager(unittest.TestCase):
-
- def testManager(self):
- with pysam.Samfile(os.path.join(DATADIR, 'ex1.bam'),
- 'rb') as samfile:
- samfile.fetch()
- self.assertEqual(samfile.closed, True)
-
-
-class TestExceptions(unittest.TestCase):
-
- def setUp(self):
- self.samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- def testMissingFile(self):
-
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.bam", "rb")
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.sam", "r")
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.bam", "r")
- self.assertRaises(IOError, pysam.Samfile, "exdoesntexist.sam", "rb")
-
- def testBadContig(self):
- self.assertRaises(ValueError, self.samfile.fetch, "chr88")
-
- def testMeaninglessCrap(self):
- self.assertRaises(ValueError, self.samfile.fetch, "skljf")
-
- def testBackwardsOrderNewFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, 'chr1', 100, 10)
-
- def testBackwardsOrderOldFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:100-10")
-
- def testOutOfRangeNegativeNewFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, "chr1", 5, -10)
- self.assertRaises(ValueError, self.samfile.fetch, "chr1", 5, 0)
- self.assertRaises(ValueError, self.samfile.fetch, "chr1", -5, -10)
-
- self.assertRaises(ValueError, self.samfile.count, "chr1", 5, -10)
- self.assertRaises(ValueError, self.samfile.count, "chr1", 5, 0)
- self.assertRaises(ValueError, self.samfile.count, "chr1", -5, -10)
-
- def testOutOfRangeNegativeOldFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5-10")
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5-0")
- self.assertRaises(ValueError, self.samfile.fetch, region="chr1:-5--10")
-
- self.assertRaises(ValueError, self.samfile.count, region="chr1:-5-10")
- self.assertRaises(ValueError, self.samfile.count, region="chr1:-5-0")
- self.assertRaises(ValueError, self.samfile.count, region="chr1:-5--10")
-
- def testOutOfRangNewFormat(self):
- self.assertRaises(
- ValueError, self.samfile.fetch, "chr1", 9999999999, 99999999999)
- self.assertRaises(
- ValueError, self.samfile.count, "chr1", 9999999999, 99999999999)
-
- def testOutOfRangeLargeNewFormat(self):
- self.assertRaises(ValueError, self.samfile.fetch, "chr1",
- 9999999999999999999999999999999, 9999999999999999999999999999999999999999)
- self.assertRaises(ValueError, self.samfile.count, "chr1",
- 9999999999999999999999999999999, 9999999999999999999999999999999999999999)
-
- def testOutOfRangeLargeOldFormat(self):
- self.assertRaises(
- ValueError, self.samfile.fetch, "chr1:99999999999999999-999999999999999999")
- self.assertRaises(
- ValueError, self.samfile.count, "chr1:99999999999999999-999999999999999999")
-
- def testZeroToZero(self):
- '''see issue 44'''
- self.assertEqual(len(list(self.samfile.fetch('chr1', 0, 0))), 0)
-
- def tearDown(self):
- self.samfile.close()
-
-
-class TestWrongFormat(unittest.TestCase):
-
- '''test cases for opening files not in bam/sam format.'''
-
- def testOpenSamAsBam(self):
- self.assertRaises(ValueError,
- pysam.Samfile,
- os.path.join(DATADIR, 'ex1.sam'),
- 'rb')
-
- def testOpenBamAsSam(self):
- # test fails, needs to be implemented.
- # sam.fetch() fails on reading, not on opening
- # self.assertRaises( ValueError, pysam.Samfile, 'ex1.bam', 'r' )
- pass
-
- def testOpenFastaAsSam(self):
- # test fails, needs to be implemented.
- # sam.fetch() fails on reading, not on opening
- # self.assertRaises( ValueError, pysam.Samfile, 'ex1.fa', 'r' )
- pass
-
- def testOpenFastaAsBam(self):
- self.assertRaises(ValueError,
- pysam.Samfile,
- os.path.join(DATADIR, 'ex1.fa'),
- 'rb')
-
-
-class ReadTest(unittest.TestCase):
-
- def checkFieldEqual(self, read1, read2, exclude=[]):
- '''check if two reads are equal by comparing each field.'''
-
- # add the . for refactoring purposes.
- for x in (".qname", ".seq", ".flag",
- ".rname", ".pos", ".mapq", ".cigar",
- ".mrnm", ".mpos", ".isize",
- ".qual",
- ".bin",
- ".is_paired", ".is_proper_pair",
- ".is_unmapped", ".mate_is_unmapped",
- ".is_reverse", ".mate_is_reverse",
- ".is_read1", ".is_read2",
- ".is_secondary", ".is_qcfail",
- ".is_duplicate"):
- n = x[1:]
- if n in exclude:
- continue
- self.assertEqual(getattr(read1, n), getattr(read2, n),
- "attribute mismatch for %s: %s != %s" %
- (n, getattr(read1, n), getattr(read2, n)))
-
-
-class TestAlignedRead(ReadTest):
-
- '''tests to check if aligned read can be constructed
- and manipulated.
- '''
-
- def testEmpty(self):
- a = pysam.AlignedRead()
- self.assertEqual(a.qname, None)
- self.assertEqual(a.seq, None)
- self.assertEqual(a.qual, None)
- self.assertEqual(a.flag, 0)
- self.assertEqual(a.rname, -1)
- self.assertEqual(a.mapq, 0)
- self.assertEqual(a.cigar, [])
- self.assertEqual(a.tags, [])
- self.assertEqual(a.mrnm, -1)
- self.assertEqual(a.mpos, -1)
- self.assertEqual(a.isize, 0)
-
- def testStrOfEmptyRead(self):
- a = pysam.AlignedRead()
- s = str(a)
- self.assertEqual(
- "None\t0\t-1\t-1\t0\tNone\t-1\t-1\t0\tNone\tNone\t[]",
- s)
-
- def buildRead(self):
- '''build an example read.'''
-
- a = pysam.AlignedRead()
- a.qname = "read_12345"
- a.seq = "ACGT" * 10
- a.flag = 0
- a.rname = 0
- a.pos = 20
- a.mapq = 20
- a.cigar = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
- a.mrnm = 0
- a.mpos = 200
- a.isize = 167
- a.qual = "1234" * 10
- # todo: create tags
- return a
-
- def testUpdate(self):
- '''check if updating fields affects other variable length data
- '''
- a = self.buildRead()
- b = self.buildRead()
-
- # check qname
- b.qname = "read_123"
- self.checkFieldEqual(a, b, "qname")
- b.qname = "read_12345678"
- self.checkFieldEqual(a, b, "qname")
- b.qname = "read_12345"
- self.checkFieldEqual(a, b)
-
- # check cigar
- b.cigar = ((0, 10), )
- self.checkFieldEqual(a, b, "cigar")
- b.cigar = ((0, 10), (2, 1), (0, 10))
- self.checkFieldEqual(a, b, "cigar")
- b.cigar = ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))
- self.checkFieldEqual(a, b)
-
- # check seq
- b.seq = "ACGT"
- self.checkFieldEqual(a, b, ("seq", "qual"))
- b.seq = "ACGT" * 3
- self.checkFieldEqual(a, b, ("seq", "qual"))
- b.seq = "ACGT" * 10
- self.checkFieldEqual(a, b, ("qual",))
-
- # reset qual
- b = self.buildRead()
-
- # check flags:
- for x in (
- "is_paired", "is_proper_pair",
- "is_unmapped", "mate_is_unmapped",
- "is_reverse", "mate_is_reverse",
- "is_read1", "is_read2",
- "is_secondary", "is_qcfail",
- "is_duplicate"):
- setattr(b, x, True)
- self.assertEqual(getattr(b, x), True)
- self.checkFieldEqual(a, b, ("flag", x,))
- setattr(b, x, False)
- self.assertEqual(getattr(b, x), False)
- self.checkFieldEqual(a, b)
-
- def testUpdate2(self):
- '''issue 135: inplace update of sequence and quality score.
-
- This does not work as setting the sequence will erase
- the quality scores.
- '''
- a = self.buildRead()
- a.seq = a.seq[5:10]
- self.assertEqual(a.qual, None)
-
- a = self.buildRead()
- s = a.qual
- a.seq = a.seq[5:10]
- a.qual = s[5:10]
-
- self.assertEqual(a.qual, s[5:10])
-
- def testLargeRead(self):
- '''build an example read.'''
-
- a = pysam.AlignedRead()
- a.qname = "read_12345"
- a.seq = "ACGT" * 200
- a.flag = 0
- a.rname = 0
- a.pos = 20
- a.mapq = 20
- a.cigar = ((0, 4 * 200), )
- a.mrnm = 0
- a.mpos = 200
- a.isize = 167
- a.qual = "1234" * 200
-
- return a
-
- def testTagParsing(self):
- '''test for tag parsing
-
- see http://groups.google.com/group/pysam-user-group/browse_thread/thread/67ca204059ea465a
- '''
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex8.bam"),
- "rb")
-
- for entry in samfile:
- before = entry.tags
- entry.tags = entry.tags
- after = entry.tags
- self.assertEqual(after, before)
-
- def testUpdateTlen(self):
- '''check if updating tlen works'''
- a = self.buildRead()
- oldlen = a.tlen
- oldlen *= 2
- a.tlen = oldlen
- self.assertEqual(a.tlen, oldlen)
-
- def testPositions(self):
- a = self.buildRead()
- self.assertEqual(a.positions,
- [20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
- 31, 32, 33, 34, 35, 36, 37, 38, 39,
- 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
- 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])
-
- self.assertEqual(a.aligned_pairs,
- [(0, 20), (1, 21), (2, 22), (3, 23), (4, 24),
- (5, 25), (6, 26), (7, 27), (8, 28), (9, 29),
- (None, 30),
- (10, 31), (11, 32), (12, 33), (13, 34), (14, 35),
- (15, 36), (16, 37), (17, 38), (18, 39), (19, None),
- (20, 40), (21, 41), (22, 42), (23, 43), (24, 44),
- (25, 45), (26, 46), (27, 47), (28, 48), (29, 49),
- (30, 50), (31, 51), (32, 52), (33, 53), (34, 54),
- (35, 55), (36, 56), (37, 57), (38, 58), (39, 59)])
-
- self.assertEqual(
- a.positions,
- [x[1] for x in a.aligned_pairs
- if x[0] is not None and x[1] is not None])
- # alen is the length of the aligned read in genome
- self.assertEqual(a.alen, a.aligned_pairs[-1][0] + 1)
- # aend points to one beyond last aligned base in ref
- self.assertEqual(a.positions[-1], a.aend - 1)
-
- def testBlocks(self):
- a = self.buildRead()
- self.assertEqual(a.blocks,
- [(20, 30), (31, 40), (40, 60)])
-
- # Disabled as not backwards compatible
- # def testFancyStr(self):
- # a = self.buildRead()
- # output = a.fancy_str()
- # self.assertEqual(len(output), 9)
-
-
-class TestDeNovoConstruction(ReadTest):
-
- '''check BAM/SAM file construction using ex6.sam
-
- (note these are +1 coordinates):
-
- read_28833_29006_6945 99 chr1 33 20 10M1D25M = 200 167 AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG <<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<< NM:i:1 RG:Z:L1
- read_28701_28881_323b 147 chr2 88 30 35M = 500 412 ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA <<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<< MF:i:18 RG:Z:L2
- '''
-
- header = {'HD': {'VN': '1.0'},
- 'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}], }
-
- bamfile = os.path.join(DATADIR, "ex6.bam")
- samfile = os.path.join(DATADIR, "ex6.sam")
-
- def setUp(self):
-
- a = pysam.AlignedRead()
- a.qname = "read_28833_29006_6945"
- a.seq = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
- a.flag = 99
- a.rname = 0
- a.pos = 32
- a.mapq = 20
- a.cigar = ((0, 10), (2, 1), (0, 25))
- a.mrnm = 0
- a.mpos = 199
- a.isize = 167
- a.qual = "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<"
- a.tags = (("NM", 1),
- ("RG", "L1"))
-
- b = pysam.AlignedRead()
- b.qname = "read_28701_28881_323b"
- b.seq = "ACCTATATCTTGGCCTTGGCCGATGCGGCCTTGCA"
- b.flag = 147
- b.rname = 1
- b.pos = 87
- b.mapq = 30
- b.cigar = ((0, 35), )
- b.mrnm = 1
- b.mpos = 499
- b.isize = 412
- b.qual = "<<<<<;<<<<7;:<<<6;<<<<<<<<<<<<7<<<<"
- b.tags = (("MF", 18),
- ("RG", "L2"))
-
- self.reads = (a, b)
-
- # TODO
- # def testSAMWholeFile(self):
-
- # tmpfilename = "tmp_%i.sam" % id(self)
-
- # outfile = pysam.Samfile(tmpfilename,
- # "wh",
- # header=self.header)
-
- # for x in self.reads:
- # outfile.write(x)
- # outfile.close()
- # self.assertTrue(checkBinaryEqual(tmpfilename, self.samfile),
- # "mismatch when construction SAM file, see %s %s" % (tmpfilename, self.samfile))
-
- # os.unlink(tmpfilename)
-
- def testBAMPerRead(self):
- '''check if individual reads are binary equal.'''
- infile = pysam.Samfile(self.bamfile, "rb")
-
- others = list(infile)
- for denovo, other in zip(others, self.reads):
- self.checkFieldEqual(other, denovo)
- self.assertEqual(other.compare(denovo), 0)
-
- # TODO
- # def testSAMPerRead(self):
- # '''check if individual reads are binary equal.'''
- # infile = pysam.Samfile(self.samfile, "r")
-
- # others = list(infile)
- # for denovo, other in zip(others, self.reads):
- # self.checkFieldEqual(other, denovo)
- # self.assertEqual(other.compare(denovo), 0)
-
- def testBAMWholeFile(self):
-
- tmpfilename = "tmp_%i.bam" % id(self)
-
- outfile = pysam.Samfile(tmpfilename, "wb", header=self.header)
-
- for x in self.reads:
- outfile.write(x)
- outfile.close()
-
- self.assertTrue(checkBinaryEqual(tmpfilename, self.bamfile),
- "mismatch when construction BAM file, see %s %s" % (tmpfilename, self.bamfile))
-
- os.unlink(tmpfilename)
-
-
-class TestDeNovoConstructionUserTags(TestDeNovoConstruction):
-
- '''test de novo construction with a header that contains lower-case tags.'''
-
- header = {'HD': {'VN': '1.0'},
- 'SQ': [{'LN': 1575, 'SN': 'chr1'},
- {'LN': 1584, 'SN': 'chr2'}],
- 'x1': {'A': 2, 'B': 5},
- 'x3': {'A': 6, 'B': 5},
- 'x2': {'A': 4, 'B': 5}}
-
- bamfile = os.path.join(DATADIR, "example_user_header.bam")
- samfile = os.path.join(DATADIR, "example_user_header.sam")
-
-
-class TestEmptyHeader(unittest.TestCase):
-
- '''see issue 84.'''
-
- def testEmptyHeader(self):
-
- s = pysam.Samfile(os.path.join(DATADIR, 'example_empty_header.bam'))
- self.assertEqual(s.header, {'SQ': [{'LN': 1000, 'SN': 'chr1'}]})
-
-COMPARE_BTAG = [100, 1, 91, 0, 7, 101, 0, 201, 96, 204,
- 0, 0, 87, 109, 0, 7, 97, 112, 1, 12, 78,
- 197, 0, 7, 100, 95, 101, 202, 0, 6, 0, 1,
- 186, 0, 84, 0, 244, 0, 0, 324, 0, 107, 195,
- 101, 113, 0, 102, 0, 104, 3, 0, 101, 1, 0,
- 212, 6, 0, 0, 1, 0, 74, 1, 11, 0, 196, 2,
- 197, 103, 0, 108, 98, 2, 7, 0, 1, 2, 194,
- 0, 180, 0, 108, 0, 203, 104, 16, 5, 205,
- 0, 0, 0, 1, 1, 100, 98, 0, 0, 204, 6, 0,
- 79, 0, 0, 101, 7, 109, 90, 265, 1, 27, 10,
- 109, 102, 9, 0, 292, 0, 110, 0, 0, 102,
- 112, 0, 0, 84, 100, 103, 2, 81, 126, 0, 2,
- 90, 0, 15, 96, 15, 1, 0, 2, 0, 107, 92, 0,
- 0, 101, 3, 98, 15, 102, 13, 116, 116, 90, 93,
- 198, 0, 0, 0, 199, 92, 26, 495, 100, 5, 0,
- 100, 5, 209, 0, 92, 107, 90, 0, 0, 0, 0, 109,
- 194, 7, 94, 200, 0, 40, 197, 0, 11, 0, 0, 112,
- 110, 6, 4, 200, 28, 0, 196, 0, 203, 1, 129,
- 0, 0, 1, 0, 94, 0, 1, 0, 107, 5, 201, 3, 3, 100,
- 0, 121, 0, 7, 0, 1, 105, 306, 3, 86, 8, 183, 0,
- 12, 163, 17, 83, 22, 0, 0, 1, 8, 109, 103, 0, 0,
- 295, 0, 200, 16, 172, 3, 16, 182, 3, 11, 0, 0,
- 223, 111, 103, 0, 5, 225, 0, 95]
-
-
-class TestBTagSam(unittest.TestCase):
-
- '''see issue 81.'''
-
- compare = [COMPARE_BTAG,
- [-100, 200, -300, -400],
- [-100, 12],
- [12, 15],
- [-1.0, 5.0, 2.5]]
-
- filename = os.path.join(DATADIR, 'example_btag.sam')
-
- read0 = [('RG', 'QW85I'),
- ('PG', 'tmap'),
- ('MD', '140'),
- ('NM', 0),
- ('AS', 140),
- ('FZ', array.array('H', COMPARE_BTAG)),
- ('XA', 'map2-1'),
- ('XS', 53),
- ('XT', 38),
- ('XF', 1),
- ('XE', 0)]
-
- def testReadTags(self):
-
- s = pysam.Samfile(self.filename)
- for x, read in enumerate(s):
- tags = read.tags
- if x == 0:
- self.assertEqual(tags, self.read0)
-
- fz = list(dict(tags)["FZ"])
- self.assertEqual(fz, self.compare[x])
- self.assertEqual(list(read.opt("FZ")), self.compare[x])
- self.assertEqual(tags, read.get_tags())
- for tag, value in tags:
- self.assertEqual(value, read.get_tag(tag))
-
- def testReadWriteTags(self):
-
- s = pysam.Samfile(self.filename)
- for read in s:
- before = read.tags
- read.tags = before
- self.assertEqual(read.tags, before)
-
- read.set_tags(before)
- self.assertEqual(read.tags, before)
-
- for tag, value in before:
- read.set_tag(tag, value)
- self.assertEqual(value, read.get_tag(tag))
-
-
-class TestBTagBam(TestBTagSam):
- filename = os.path.join(DATADIR, 'example_btag.bam')
-
-
-class TestDoubleFetch(unittest.TestCase):
-
- '''check if two iterators on the same bamfile are independent.'''
-
- filename = os.path.join(DATADIR, 'ex1.bam')
-
- def testDoubleFetch(self):
-
- samfile1 = pysam.Samfile(self.filename, 'rb')
-
- for a, b in zip(samfile1.fetch(multiple_iterators=True),
- samfile1.fetch(multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
-
- def testDoubleFetchWithRegion(self):
-
- samfile1 = pysam.Samfile(self.filename, 'rb')
- chr, start, stop = 'chr1', 200, 3000000
- # just making sure the test has something to catch
- self.assertTrue(len(list(samfile1.fetch(chr, start, stop))) > 0)
-
- for a, b in zip(samfile1.fetch(chr, start, stop),
- samfile1.fetch(chr, start, stop,
- multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
-
- def testDoubleFetchUntilEOF(self):
-
- samfile1 = pysam.Samfile(self.filename, 'rb')
-
- for a, b in zip(samfile1.fetch(until_eof=True),
- samfile1.fetch(until_eof=True,
- multiple_iterators=True)):
- self.assertEqual(a.compare(b), 0)
-
-
-class TestRemoteFileFTP(unittest.TestCase):
-
- '''test remote access.
-
- '''
-
- # Need to find an ftp server without password on standard
- # port.
-
- url = "ftp://ftp.sanger.ac.uk/pub/rd/humanSequences/CV.bam"
- region = "1:1-1000"
-
- def testFTPView(self):
- return
- if not checkURL(self.url):
- return
-
- result = pysam.samtools.view(self.url, self.region)
- self.assertEqual(len(result), 36)
-
- def testFTPFetch(self):
- return
- if not checkURL(self.url):
- return
-
- samfile = pysam.Samfile(self.url, "rb")
- result = list(samfile.fetch(region=self.region))
- self.assertEqual(len(result), 36)
-
-
-class TestRemoteFileHTTP(unittest.TestCase):
-
- url = "http://genserv.anat.ox.ac.uk/downloads/pysam/test/ex1.bam"
- region = "chr1:1-1000"
- local = os.path.join(DATADIR, "ex1.bam")
-
- def testView(self):
- if not checkURL(self.url):
- return
-
- samfile_local = pysam.Samfile(self.local, "rb")
- ref = list(samfile_local.fetch(region=self.region))
-
- result = pysam.samtools.view(
- self.url, self.region).splitlines(True)
- self.assertEqual(len(result), len(ref))
-
- def testFetch(self):
- if not checkURL(self.url):
- return
-
- samfile = pysam.Samfile(self.url, "rb")
- result = list(samfile.fetch(region=self.region))
- samfile_local = pysam.Samfile(self.local, "rb")
- ref = list(samfile_local.fetch(region=self.region))
-
- self.assertEqual(len(ref), len(result))
- for x, y in zip(result, ref):
- self.assertEqual(x.compare(y), 0)
-
- def testFetchAll(self):
- if not checkURL(self.url):
- return
-
- samfile = pysam.Samfile(self.url, "rb")
- result = list(samfile.fetch())
- samfile_local = pysam.Samfile(self.local, "rb")
- ref = list(samfile_local.fetch())
-
- self.assertEqual(len(ref), len(result))
- for x, y in zip(result, ref):
- self.assertEqual(x.compare(y), 0)
-
-
-class TestLargeOptValues(unittest.TestCase):
-
- ints = (65536, 214748, 2147484, 2147483647)
- floats = (65536.0, 214748.0, 2147484.0)
-
- def check(self, samfile):
-
- i = samfile.fetch()
- for exp in self.ints:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs,
- "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- for exp in [-x for x in self.ints]:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs,
- "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- for exp in self.floats:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs,
- "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- for exp in [-x for x in self.floats]:
- rr = next(i)
- obs = rr.opt("ZP")
- self.assertEqual(exp, obs, "expected %s, got %s\n%s" %
- (str(exp), str(obs), str(rr)))
-
- def testSAM(self):
- samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex10.sam"),
- "r")
- self.check(samfile)
-
- def testBAM(self):
- samfile = pysam.Samfile(
- os.path.join(DATADIR, "ex10.bam"),
- "rb")
- self.check(samfile)
-
-
-class TestPileup(unittest.TestCase):
-
- '''test pileup functionality.'''
-
- samfilename = "pysam_data/ex1.bam"
- fastafilename = "pysam_data/ex1.fa"
-
- def setUp(self):
-
- self.samfile = pysam.Samfile(self.samfilename)
- self.fastafile = pysam.Fastafile(self.fastafilename)
-
- def checkEqual(self, references, iterator):
-
- for x, column in enumerate(iterator):
- (contig, pos, reference_base,
- read_bases, read_qualities, alignment_mapping_qualities) \
- = references[x][:-1].split("\t")
- self.assertEqual(int(pos) - 1, column.pos)
-
- def testSamtoolsStepper(self):
- refs = force_str(
- pysam.samtools.mpileup(
- "-f", self.fastafilename,
- self.samfilename)).splitlines(True)
- iterator = self.samfile.pileup(
- stepper="samtools",
- fastafile=self.fastafile)
- self.checkEqual(refs, iterator)
-
- def testAllStepper(self):
- refs = force_str(
- pysam.samtools.mpileup(
- "-f", self.fastafilename,
- "-A", "-B",
- self.samfilename)).splitlines(True)
-
- iterator = self.samfile.pileup(
- stepper="all",
- fastafile=self.fastafile)
- self.checkEqual(refs, iterator)
-
-
-class TestLogging(unittest.TestCase):
-
- '''test around bug issue 42,
-
- failed in versions < 0.4
- '''
-
- def check(self, bamfile, log):
-
- if log:
- logger = logging.getLogger('franklin')
- logger.setLevel(logging.INFO)
- formatter = logging.Formatter(
- '%(asctime)s %(levelname)s %(message)s')
- log_hand = logging.FileHandler('log.txt')
- log_hand.setFormatter(formatter)
- logger.addHandler(log_hand)
-
- bam = pysam.Samfile(bamfile, 'rb')
- cols = bam.pileup()
- self.assertTrue(True)
-
- def testFail1(self):
- self.check(os.path.join(DATADIR, "ex9_fail.bam"),
- False)
- self.check(os.path.join(DATADIR, "ex9_fail.bam"),
- True)
-
- def testNoFail1(self):
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- False)
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- True)
-
- def testNoFail2(self):
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- True)
- self.check(os.path.join(DATADIR, "ex9_nofail.bam"),
- True)
-
-# TODOS
-# 1. finish testing all properties within pileup objects
-# 2. check exceptions and bad input problems (missing files, optional fields that aren't present, etc...)
-# 3. check: presence of sequence
-
-
-class TestSamfileUtilityFunctions(unittest.TestCase):
-
- def testCount(self):
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- for contig in ("chr1", "chr2"):
- for start in range(0, 2000, 100):
- end = start + 1
- self.assertEqual(
- len(list(samfile.fetch(contig, start, end))),
- samfile.count(contig, start, end),
- 'number mismatch for %s:%i-%i %i != %i' % (
- contig, start, end,
- len(list(samfile.fetch(contig, start, end))),
- samfile.count(contig, start, end)))
-
- # test empty intervals
- self.assertEqual(
- len(list(samfile.fetch(contig, start, start))),
- samfile.count(contig, start, start),
- 'number mismatch for %s:%i-%i %i != %i' % (
- contig, start, start,
- len(list(samfile.fetch(contig, start, start))),
- samfile.count(contig, start, start)))
-
- # test half empty intervals
- self.assertEqual(len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start))
-
- self.assertEqual(
- len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start),
- 'number mismatch for %s:%i %i != %i' % (
- contig, start,
- len(list(samfile.fetch(contig, start))),
- samfile.count(contig, start)))
-
- def testMate(self):
- '''test mate access.'''
-
- with open(os.path.join(DATADIR, "ex1.sam"), "rb") as inf:
- readnames = [x.split(b"\t")[0] for x in inf.readlines()]
- if sys.version_info[0] >= 3:
- readnames = [name.decode('ascii') for name in readnames]
-
- counts = collections.defaultdict(int)
- for x in readnames:
- counts[x] += 1
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
-
- for read in samfile.fetch():
- if not read.is_paired:
- self.assertRaises(ValueError, samfile.mate, read)
- elif read.mate_is_unmapped:
- self.assertRaises(ValueError, samfile.mate, read)
- else:
- if counts[read.qname] == 1:
- self.assertRaises(ValueError, samfile.mate, read)
- else:
- mate = samfile.mate(read)
- self.assertEqual(read.qname, mate.qname)
- self.assertEqual(read.is_read1, mate.is_read2)
- self.assertEqual(read.is_read2, mate.is_read1)
- self.assertEqual(read.pos, mate.mpos)
- self.assertEqual(read.mpos, mate.pos)
-
- def testIndexStats(self):
- '''test if total number of mapped/unmapped reads is correct.'''
-
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- self.assertEqual(samfile.mapped, 3235)
- self.assertEqual(samfile.unmapped, 35)
- self.assertEqual(samfile.nocoordinate, 0)
-
-
-class TestSamtoolsProxy(unittest.TestCase):
-
- '''tests for sanity checking access to samtools functions.'''
-
- def testIndex(self):
- self.assertRaises(IOError, pysam.samtools.index, "missing_file")
-
- def testView(self):
- # note that view still echos "open: No such file or directory"
- self.assertRaises(pysam.SamtoolsError, pysam.samtools.view, "missing_file")
-
- def testSort(self):
- self.assertRaises(pysam.SamtoolsError, pysam.samtools.sort, "missing_file")
-
-
-class TestSamfileIndex(unittest.TestCase):
-
- def testIndex(self):
- samfile = pysam.Samfile(os.path.join(DATADIR, "ex1.bam"),
- "rb")
- index = pysam.IndexedReads(samfile)
- index.build()
- reads = collections.defaultdict(int)
-
- for read in samfile:
- reads[read.qname] += 1
-
- for qname, counts in reads.items():
- found = list(index.find(qname))
- self.assertEqual(len(found), counts)
- for x in found:
- self.assertEqual(x.qname, qname)
-
-
-if __name__ == "__main__":
- # build data files
- print ("building data files")
- subprocess.call("make -C %s" % DATADIR, shell=True)
- print ("starting tests")
- unittest.main()
- print ("completed tests")
import os
+import sys
import subprocess
import threading
import errno
from pysam import AlignmentFile
+IS_PYTHON2 = sys.version_info[0] == 2
+
DATADIR = os.path.abspath(os.path.join(
os.path.dirname(__file__),
"pysam_data"))
def alignmentfile_writer_thread(infile, outfile):
def _writer_thread(infile, outfile):
- """read from infile and write to outfile"""
+ """read from infile and write to outfile"""
try:
i = 0
for record in infile:
read += 1
return 0, read
+ @unittest.skipIf(IS_PYTHON2, "no context manager in py2")
def test_text_processing(self):
- proc = subprocess.Popen('head -n200',
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- shell=True)
+ with subprocess.Popen('head -n200',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True) as proc:
- in_stream = AlignmentFile('pysam_data/ex1.bam')
- out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
- writer = alignmentfile_writer_thread(in_stream,
- out_stream)
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wh', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
- written, read = self.stream_process(proc,
- in_stream,
- out_stream,
- writer)
- self.assertEqual(read, 198)
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 198)
+ @unittest.skip("test contains bug")
def test_samtools_processing(self):
-
- proc = subprocess.Popen('samtools view -b -f 4',
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- shell=True)
-
- in_stream = AlignmentFile('pysam_data/ex1.bam')
- out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
- writer = alignmentfile_writer_thread(in_stream,
- out_stream)
-
- written, read = self.stream_process(proc,
- in_stream,
- out_stream,
- writer)
- self.assertEqual(read, 35)
-
+
+ # The following test causes the suite to hang
+ # as the stream_processor raises:
+ # ValueError: file has no sequences defined (mode='r') - is it SAM/BAM format?
+ # The whole setup then hangs during exception handling.
+ with subprocess.Popen('samtools view -b -f 4',
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ shell=True) as proc:
+
+ in_stream = AlignmentFile('pysam_data/ex1.bam')
+ out_stream = AlignmentFile(proc.stdin, 'wb', header=in_stream.header)
+ writer = alignmentfile_writer_thread(in_stream,
+ out_stream)
+
+ written, read = self.stream_process(proc,
+ in_stream,
+ out_stream,
+ writer)
+ self.assertEqual(read, 35)
+
if __name__ == "__main__":
unittest.main()
'''return true if the two files are equal in their
content through samtools view.
'''
-
# strip MD and NM tags, as not preserved in CRAM files
args = ["-x", "MD", "-x", "NM"]
if not without_header:
filter_f:
remover lines in both a and b where expression is True
"""
- aa = openfile(a).readlines()
- bb = openfile(b).readlines()
+ with openfile(a) as inf:
+ aa = inf.readlines()
+ with openfile(b) as inf:
+ bb = inf.readlines()
if filter_f is not None:
aa = [x for x in aa if not filter_f(x)]
dir=".")
f.close()
return f.name
+
+
+def load_and_convert(filename, encode=True):
+ '''load data from filename and convert all fields to string.
+
+ Filename can be either plain or compressed (ending in .gz).
+ '''
+ data = []
+ if filename.endswith(".gz"):
+ with gzip.open(filename) as inf:
+ for line in inf:
+ line = line.decode("ascii")
+ if line.startswith("#"):
+ continue
+ d = line.strip().split("\t")
+ data.append(d)
+ else:
+ with open(filename) as f:
+ for line in f:
+ if line.startswith("#"):
+ continue
+ d = line.strip().split("\t")
+ data.append(d)
+
+ return data
except ImportError:
Path = None
-from TestUtils import get_temp_filename, check_lines_equal
+from TestUtils import get_temp_filename, check_lines_equal, load_and_convert
DATADIR="cbcf_data"
-from tabix_test import loadAndConvert
def read_header(filename):
filename = "missing_genotypes.vcf"
def setUp(self):
- self.compare = loadAndConvert(
+ self.compare = load_and_convert(
os.path.join(DATADIR, self.filename),
encode=False)
url = "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa"
-
def testFTPView(self):
if not checkURL(self.url):
return
+
with pysam.Fastafile(self.url) as f:
self.assertEqual(
len(f.fetch("chr1", 0, 1000)),
1000)
+ def test_sequence_lengths_are_available(self):
+ if not checkURL(self.url):
+ return
+
+ with pysam.Fastafile(self.url) as f:
+ self.assertEqual(len(f.references), 3366)
+ self.assertTrue("chr1" in f.references)
+ self.assertEqual(f.lengths[0],
+ 248956422)
+ self.assertEqual(f.get_reference_length("chr1"),
+ 248956422)
+
if __name__ == "__main__":
unittest.main()
"ex1.fa", "ex1.fa.fai",
"ex1.sam.gz",
"ex1.bam", "ex1.bam.bai",
- "ex1.sam", "ex2.bam",
+ "ex1.sam",
+ "ex1.sam",
+ "ex2.bam",
+ "ex2.sam",
"ex1.bed"]
# a list of statements to test
# unknow option
# "rmdup -s ex1.bam %(out)s_ex1.rmdup.bam",
# "merge -f %(out)s_ex1.merge.bam ex1.bam ex1.bam",
- "reheader ex1.sam ex1.bam > %(out)s_ex1.reheader",
+ "reheader ex2.sam ex1.bam > %(out)s_ex1.reheader.bam",
"cat -o %(out)s_ex1.cat.bam ex1.bam ex1.bam",
"targetcut ex1.bam > %(out)s_ex1.targetcut",
"phase ex1.bam > %(out)s_ex1.phase",
files.
'''
-
self.check_version()
if not os.path.exists(WORKDIR):
return
+ def get_command(self, statement, map_to_internal=True):
+ """return samtools command from statement"""
+ parts = statement.split(" ")
+ command = parts[0]
+ if map_to_internal:
+ return self.map_command.get(command, command)
+ else:
+ return command
+
def check_statement(self, statement):
parts = statement.split(" ")
r_samtools = {"out": self.executable}
r_pysam = {"out": "pysam"}
- command = parts[0]
- command = self.map_command.get(command, command)
+ command = self.get_command(statement)
+
# self.assertTrue(command in pysam.SAMTOOLS_DISPATCH)
targets = [x for x in parts if "%(out)s" in x]
check_samtools_view_equal(
s, p, without_header=True),
error_msg)
- check_lines_equal(
- self, s, p,
- filter_f=lambda x: x.startswith("#"),
+ else:
+ check_lines_equal(
+ self, s, p,
+ filter_f=lambda x: x.startswith("#"),
msg=error_msg)
def testStatements(self):
continue
self.check_statement(statement)
+ @unittest.skipIf(sys.platform == "darwin", "not supported, pattern does not match")
+ def testUsage(self):
+ if self.executable == "bcftools":
+ # bcftools usage messages end with exit(1)
+ return
+
+ for statement in self.statements:
+ command = self.get_command(statement, map_to_internal=False)
+ if command == "bam2fq":
+ continue
+ mapped_command = self.get_command(statement, map_to_internal=True)
+ pysam_method = getattr(self.module, mapped_command)
+ usage_msg = pysam_method.usage()
+ expected = "Usage:\s+{} {}".format(self.executable, command)
+ self.assertTrue(re.search(expected, usage_msg) is not None)
+
def tearDown(self):
if os.path.exists(WORKDIR):
shutil.rmtree(WORKDIR)
# "filter -s A ex1.vcf.gz > %(out)s_ex1.filter",
# exit
# "gtcheck -s A ex1.vcf.gz > %(out)s_ex1.gtcheck",
- "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
+ # segfauld, used to work wit bcftools 1.3
+ # "roh -s A ex1.vcf.gz > %(out)s_ex1.roh",
"stats ex1.vcf.gz > %(out)s_ex1.stats",
]
import glob
import re
import copy
-from TestUtils import checkURL
+from TestUtils import checkURL, load_and_convert
DATADIR = 'tabix_data'
return gzip.open(mode)
-def loadAndConvert(filename, encode=True):
- '''load data from filename and convert all fields to string.
-
- Filename can be either plain or compressed (ending in .gz).
- '''
- data = []
- if filename.endswith(".gz"):
- with gzip.open(filename) as inf:
- for line in inf:
- line = line.decode("ascii")
- if line.startswith("#"):
- continue
- d = line.strip().split("\t")
- data.append(d)
- else:
- with open(filename) as f:
- for line in f:
- if line.startswith("#"):
- continue
- d = line.strip().split("\t")
- data.append(d)
-
- return data
-
-
def splitToBytes(s):
'''split string and return list of bytes.'''
return [x.encode("ascii") for x in s.split("\t")]
TestIterationWithoutComments.setUp(self)
-class TestParser(unittest.TestCase):
-
- filename = os.path.join(DATADIR, "example.gtf.gz")
-
- def setUp(self):
-
- self.tabix = pysam.TabixFile(self.filename)
- self.compare = loadAndConvert(self.filename)
-
- def tearDown(self):
- self.tabix.close()
-
- def testRead(self):
-
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
- c = self.compare[x]
- self.assertEqual(c, list(r))
- self.assertEqual(len(c), len(r))
-
- # test indexing
- for y in range(0, len(r)):
- self.assertEqual(c[y], r[y])
-
- # test slicing access
- for y in range(0, len(r) - 1):
- for cc in range(y + 1, len(r)):
- self.assertEqual(c[y:cc],
- r[y:cc])
- self.assertEqual("\t".join(map(str, c)),
- str(r))
-
- def testWrite(self):
-
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- c = list(r)
- for y in range(len(r)):
- r[y] = "test_%05i" % y
- c[y] = "test_%05i" % y
- self.assertEqual([x for x in c], list(r))
- self.assertEqual("\t".join(c), str(r))
- # check second assignment
- for y in range(len(r)):
- r[y] = "test_%05i" % y
- self.assertEqual([x for x in c], list(r))
- self.assertEqual("\t".join(c), str(r))
-
- def testUnset(self):
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- c = list(r)
- e = list(r)
- for y in range(len(r)):
- r[y] = None
- c[y] = None
- e[y] = ""
- self.assertEqual(c, list(r))
- self.assertEqual("\t".join(e), str(r))
-
- def testIteratorCompressed(self):
- '''test iteration from compressed file.'''
- with gzip.open(self.filename) as infile:
- for x, r in enumerate(pysam.tabix_iterator(
- infile, pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- self.assertEqual(len(self.compare[x]), len(r))
-
- # test indexing
- for c in range(0, len(r)):
- self.assertEqual(self.compare[x][c], r[c])
-
- # test slicing access
- for c in range(0, len(r) - 1):
- for cc in range(c + 1, len(r)):
- self.assertEqual(self.compare[x][c:cc],
- r[c:cc])
-
- def testIteratorUncompressed(self):
- '''test iteration from uncompressed file.'''
- tmpfilename = 'tmp_testIteratorUncompressed'
- with gzip.open(self.filename, "rb") as infile, \
- open(tmpfilename, "wb") as outfile:
- outfile.write(infile.read())
-
- with open(tmpfilename) as infile:
- for x, r in enumerate(pysam.tabix_iterator(
- infile, pysam.asTuple())):
- self.assertEqual(self.compare[x], list(r))
- self.assertEqual(len(self.compare[x]), len(r))
-
- # test indexing
- for c in range(0, len(r)):
- self.assertEqual(self.compare[x][c], r[c])
-
- # test slicing access
- for c in range(0, len(r) - 1):
- for cc in range(c + 1, len(r)):
- self.assertEqual(self.compare[x][c:cc],
- r[c:cc])
-
- os.unlink(tmpfilename)
-
- def testCopy(self):
- a = self.tabix.fetch(parser=pysam.asTuple()).next()
- b = copy.copy(a)
- self.assertEqual(a, b)
-
- a = self.tabix.fetch(parser=pysam.asGTF()).next()
- b = copy.copy(a)
- self.assertEqual(a, b)
-
-
-class TestGTF(TestParser):
-
- def testRead(self):
-
- for x, r in enumerate(self.tabix.fetch(parser=pysam.asGTF())):
- c = self.compare[x]
- self.assertEqual(len(c), len(r))
- self.assertEqual(list(c), list(r))
- self.assertEqual(c, str(r).split("\t"))
- self.assertTrue(r.gene_id.startswith("ENSG"))
- if r.feature != 'gene':
- self.assertTrue(r.transcript_id.startswith("ENST"))
- self.assertEqual(c[0], r.contig)
- self.assertEqual("\t".join(map(str, c)),
- str(r))
-
- def testSetting(self):
-
- for r in self.tabix.fetch(parser=pysam.asGTF()):
- r.contig = r.contig + "_test"
- r.source = r.source + "_test"
- r.feature = r.feature + "_test"
- r.start += 10
- r.end += 10
- r.score = 20
- r.strand = "+"
- r.frame = 0
- r.attributes = 'gene_id "0001";'
-
-
+
class TestIterators(unittest.TestCase):
-
filename = os.path.join(DATADIR, "example.gtf.gz")
iterator = pysam.tabix_generic_iterator
def setUp(self):
self.tabix = pysam.TabixFile(self.filename)
- self.compare = loadAndConvert(self.filename)
+ self.compare = load_and_convert(self.filename)
self.tmpfilename_uncompressed = 'tmp_TestIterators'
with gzip.open(self.filename, "rb") as infile, \
open(self.tmpfilename_uncompressed, "wb") as outfile:
'''test reading from malformatted gtf files.'''
- parser = pysam.asGTF
iterator = pysam.tabix_generic_iterator
parser = pysam.asGTF
def setUp(self):
self.tabix = pysam.TabixFile(self.filename)
- self.compare = loadAndConvert(self.filename)
+ self.compare = load_and_convert(self.filename)
def tearDown(self):
self.tabix.close()
TestVCF.setUp(self)
self.tabix = pysam.TabixFile(self.tmpfilename + ".gz")
- self.compare = loadAndConvert(self.filename)
+ self.compare = load_and_convert(self.filename)
def tearDown(self):
self.tabix.close()
TestVCF.setUp(self)
self.vcf = pysam.VCF()
- self.compare = loadAndConvert(self.filename, encode=False)
+ self.compare = load_and_convert(self.filename, encode=False)
def tearDown(self):
self.vcf.close()
- def testConnecting(self):
+ def open_vcf(self, fn):
+ return self.vcf.connect(fn)
+
+ def get_failure_stage(self):
fn = os.path.basename(self.filename)
for x, msg in self.fail_on_opening:
- if "%i.vcf" % x == fn:
- self.assertRaises(ValueError,
- self.vcf.connect,
- self.tmpfilename + ".gz")
- else:
- self.vcf.connect(self.tmpfilename + ".gz")
+ if "{}.vcf".format(x) == fn:
+ return "opening"
+
+ for x, msg in self.fail_on_parsing:
+ if "{}.vcf".format(x) == fn:
+ return "parsing"
+
+ for x, msg in self.fail_on_samples:
+ if "{}.vcf".format(x) == fn:
+ return "samples"
+
+ return None
+
+ def testConnecting(self):
+
+ if self.get_failure_stage() == "opening":
+ self.assertRaises(ValueError,
+ self.open_vcf,
+ self.tmpfilename + ".gz")
+ else:
+ self.open_vcf(self.tmpfilename + ".gz")
def get_iterator(self):
with open(self.filename) as f:
fn = os.path.basename(self.filename)
-
- for x, msg in self.fail_on_opening:
- if "%i.vcf" % x == fn:
- self.assertRaises(ValueError, self.vcf.parse, f)
- return
-
- for vcf_code, msg in self.fail_on_parsing:
- if "%i.vcf" % vcf_code == fn:
- self.assertRaises((ValueError,
- AssertionError),
- list, self.vcf.parse(f))
- return
- # python 2.7
- # self.assertRaisesRegexp(
- # ValueError, re.compile(msg), self.vcf.parse, f)
-
return list(self.vcf.parse(f))
def get_field_value(self, record, field):
def testParsing(self):
+ if self.get_failure_stage() in ("opening", "parsing"):
+ return
+
itr = self.get_iterator()
if itr is None:
return
fn = os.path.basename(self.filename)
- for vcf_code, msg in self.fail_on_parsing:
- if "%i.vcf" % vcf_code == fn:
- self.assertRaises((ValueError,
- AssertionError),
- list, itr)
- return
- # python 2.7
- # self.assertRaisesRegexp(
- # ValueError, re.compile(msg), self.vcf.parse, f)
-
check_samples = self.check_samples
for vcf_code, msg in self.fail_on_samples:
if "%i.vcf" % vcf_code == fn:
"ref", "alts", "qual",
"filter", "info", "format")
- fail_on_parsing = []
- fail_on_opening = []
+ fail_on_parsing = [
+ (24, "Could not parse the header, sample line not found"),
+ ("issue85", "empty VCF"),
+ ]
+ fail_on_opening = [
+ (24, "Could not parse the header, sample line not found"),
+ ("issue85", "empty VCF"),
+ ]
coordinate_offset = 0
check_samples = True
fail_on_samples = [
def setUp(self):
TestVCF.setUp(self)
- self.compare = loadAndConvert(self.filename, encode=False)
+ self.compare = load_and_convert(self.filename, encode=False)
def tearDown(self):
if self.vcf:
def get_field_value(self, record, field):
return getattr(record, field)
+ def open_vcf(self, fn):
+ with pysam.VariantFile(fn) as inf:
+ pass
+
for vcf_file in vcf_files:
- n = "TestVCFFromVariantFile_%s" % os.path.basename(vcf_file[:-4])
+ p = os.path.basename(vcf_file[:-4])
+ n = "TestVCFFromVariantFile_%s" % p
globals()[n] = type(n, (TestVCFFromVariantFile,), dict(filename=vcf_file,))
def check(self, filename, raises=None):
with pysam.TabixFile(filename) as tf:
- ref = loadAndConvert(filename)
+ ref = load_and_convert(filename)
if raises is None:
self.assertEqual(len(list(tf.fetch())), len(ref))
else:
--- /dev/null
+import unittest
+import pysam
+import os
+import sys
+import re
+import copy
+import gzip
+from TestUtils import load_and_convert
+
+DATADIR = 'tabix_data'
+
+
+class TestParser(unittest.TestCase):
+
+ filename = os.path.join(DATADIR, "example.gtf.gz")
+
+ def setUp(self):
+
+ self.tabix = pysam.TabixFile(self.filename)
+ self.compare = load_and_convert(self.filename)
+
+ def tearDown(self):
+ self.tabix.close()
+
+ def testRead(self):
+
+ for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
+ c = self.compare[x]
+ self.assertEqual(c, list(r))
+ self.assertEqual(len(c), len(r))
+
+ # test indexing
+ for y in range(0, len(r)):
+ self.assertEqual(c[y], r[y])
+
+ # test slicing access
+ for y in range(0, len(r) - 1):
+ for cc in range(y + 1, len(r)):
+ self.assertEqual(c[y:cc],
+ r[y:cc])
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
+
+ def testWrite(self):
+
+ for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ c = list(r)
+ for y in range(len(r)):
+ r[y] = "test_%05i" % y
+ c[y] = "test_%05i" % y
+ self.assertEqual([x for x in c], list(r))
+ self.assertEqual("\t".join(c), str(r))
+ # check second assignment
+ for y in range(len(r)):
+ r[y] = "test_%05i" % y
+ self.assertEqual([x for x in c], list(r))
+ self.assertEqual("\t".join(c), str(r))
+
+ def testUnset(self):
+ for x, r in enumerate(self.tabix.fetch(parser=pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ c = list(r)
+ e = list(r)
+ for y in range(len(r)):
+ r[y] = None
+ c[y] = None
+ e[y] = ""
+ self.assertEqual(c, list(r))
+ self.assertEqual("\t".join(e), str(r))
+
+ def testIteratorCompressed(self):
+ '''test iteration from compressed file.'''
+ with gzip.open(self.filename) as infile:
+ for x, r in enumerate(pysam.tabix_iterator(
+ infile, pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ self.assertEqual(len(self.compare[x]), len(r))
+
+ # test indexing
+ for c in range(0, len(r)):
+ self.assertEqual(self.compare[x][c], r[c])
+
+ # test slicing access
+ for c in range(0, len(r) - 1):
+ for cc in range(c + 1, len(r)):
+ self.assertEqual(self.compare[x][c:cc],
+ r[c:cc])
+
+ def testIteratorUncompressed(self):
+ '''test iteration from uncompressed file.'''
+ tmpfilename = 'tmp_testIteratorUncompressed'
+ with gzip.open(self.filename, "rb") as infile, \
+ open(tmpfilename, "wb") as outfile:
+ outfile.write(infile.read())
+
+ with open(tmpfilename) as infile:
+ for x, r in enumerate(pysam.tabix_iterator(
+ infile, pysam.asTuple())):
+ self.assertEqual(self.compare[x], list(r))
+ self.assertEqual(len(self.compare[x]), len(r))
+
+ # test indexing
+ for c in range(0, len(r)):
+ self.assertEqual(self.compare[x][c], r[c])
+
+ # test slicing access
+ for c in range(0, len(r) - 1):
+ for cc in range(c + 1, len(r)):
+ self.assertEqual(self.compare[x][c:cc],
+ r[c:cc])
+
+ os.unlink(tmpfilename)
+
+ def testCopy(self):
+ a = self.tabix.fetch(parser=pysam.asTuple()).next()
+ b = copy.copy(a)
+ self.assertEqual(a, b)
+
+ a = self.tabix.fetch(parser=pysam.asGTF()).next()
+ b = copy.copy(a)
+ self.assertEqual(a, b)
+
+
+class TestGTF(TestParser):
+
+ parser = pysam.asGTF
+
+ def testRead(self):
+
+ for x, r in enumerate(self.tabix.fetch(parser=self.parser())):
+ c = self.compare[x]
+ self.assertEqual(len(c), len(r))
+ self.assertEqual(list(c), list(r))
+ self.assertEqual(c, str(r).split("\t"))
+ self.assertTrue(r.gene_id.startswith("ENSG"))
+ if r.feature != 'gene':
+ self.assertTrue(r.transcript_id.startswith("ENST"))
+ self.assertEqual(c[0], r.contig)
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
+
+ def testSetting(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+
+ r.contig = r.contig + "_test_contig"
+ r.source = r.source + "_test_source"
+ r.feature = r.feature + "_test_feature"
+ r.start += 10
+ r.end += 10
+ r.score = 20
+ r.strand = "+"
+ r.frame = 0
+ r.attributes = 'gene_id "0001";'
+ r.transcript_id = "0002"
+ sr = str(r)
+ self.assertTrue("_test_contig" in sr)
+ self.assertTrue("_test_source" in sr)
+ self.assertTrue("_test_feature" in sr)
+ self.assertTrue("gene_id \"0001\"" in sr)
+ self.assertTrue("transcript_id \"0002\"" in sr)
+
+ def test_added_attribute_is_output(self):
+ r = self.tabix.fetch(parser=self.parser()).next()
+
+ r.new_int_attribute = 12
+ self.assertTrue("new_int_attribute 12" in str(r).split("\t")[8])
+
+ r.new_float_attribute = 12.0
+ self.assertTrue("new_float_attribute 12.0" in str(r).split("\t")[8])
+
+ r.new_text_attribute = "abc"
+ self.assertTrue("new_text_attribute \"abc\"" in str(r).split("\t")[8])
+
+ def test_setting_start_is_one_based(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.start = 1800
+ self.assertEqual(r.start, 1800)
+ self.assertEqual(str(r).split("\t")[3], "1801")
+
+ def test_setting_end_is_one_based(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.end = 2100
+ self.assertEqual(r.end, 2100)
+ self.assertEqual(str(r).split("\t")[4], "2100")
+
+ def test_setting_frame_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.frame = None
+ self.assertEqual(str(r).split("\t")[7], ".")
+
+ r.frame = 2
+ self.assertEqual(str(r).split("\t")[7], "2")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.frame = "."
+ self.assertEqual(r.frame, None)
+ self.assertEqual(str(r).split("\t")[7], ".")
+
+ def test_setting_source_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.source = None
+ self.assertEqual(str(r).split("\t")[1], ".")
+
+ r.source = "source"
+ self.assertEqual(str(r).split("\t")[1], "source")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.source = "."
+ self.assertEqual(r.source, None)
+ self.assertEqual(str(r).split("\t")[1], ".")
+
+ def test_setting_feature_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.feature = None
+ self.assertEqual(str(r).split("\t")[2], ".")
+
+ r.feature = "feature"
+ self.assertEqual(str(r).split("\t")[2], "feature")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.feature = "."
+ self.assertEqual(r.feature, None)
+ self.assertEqual(str(r).split("\t")[2], ".")
+
+ def test_setting_strand_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.strand = None
+ self.assertEqual(str(r).split("\t")[6], ".")
+
+ r.strand = "-"
+ self.assertEqual(str(r).split("\t")[6], "-")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.strand = "."
+ self.assertEqual(r.strand, None)
+ self.assertEqual(str(r).split("\t")[6], ".")
+
+ def test_setting_score_to_none_produces_dot(self):
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.score = None
+ self.assertEqual(str(r).split("\t")[5], ".")
+
+ r.score = 12.0
+ self.assertEqual(str(r).split("\t")[5], "12.0")
+
+ r.score = -12.0
+ self.assertEqual(str(r).split("\t")[5], "-12.0")
+
+ r = self.tabix.fetch(parser=self.parser()).next()
+ r.score = "."
+ self.assertEqual(r.score, None)
+ self.assertEqual(str(r).split("\t")[5], ".")
+
+ r.score = 12
+ self.assertEqual(str(r).split("\t")[5], "12")
+
+ r.score = -12
+ self.assertEqual(str(r).split("\t")[5], "-12")
+
+
+class TestGFF3(TestGTF):
+
+ parser = pysam.asGFF3
+ filename = os.path.join(DATADIR, "example.gff3.gz")
+
+ def testRead(self):
+ for x, r in enumerate(self.tabix.fetch(parser=self.parser())):
+ c = self.compare[x]
+ self.assertEqual(len(c), len(r))
+ self.assertEqual(list(c), list(r))
+ self.assertEqual(c, str(r).split("\t"))
+ self.assertEqual(c[0], r.contig)
+ self.assertEqual("\t".join(map(str, c)),
+ str(r))
+ self.assertTrue(r.ID.startswith("MI00"))
+
+ def testSetting(self):
+
+ for r in self.tabix.fetch(parser=self.parser()):
+ r.contig = r.contig + "_test_contig"
+ r.source = "test_source"
+ r.feature = "test_feature"
+ r.start += 10
+ r.end += 10
+ r.score = 20
+ r.strand = "+"
+ r.frame = 0
+ r.ID="test"
+ sr = str(r)
+ self.assertTrue("test_contig" in sr)
+ self.assertTrue("test_source" in sr)
+ self.assertTrue("test_feature" in sr)
+ self.assertTrue("ID=test" in sr)
+
+ def test_added_attribute_is_output(self):
+ r = self.tabix.fetch(parser=self.parser()).next()
+
+ r.new_int_attribute = 12
+ self.assertTrue("new_int_attribute=12" in str(r).split("\t")[8])
+
+ r.new_float_attribute = 12.0
+ self.assertTrue("new_float_attribute=12.0" in str(r).split("\t")[8])
+
+ r.new_text_attribute = "abc"
+ self.assertTrue("new_text_attribute=abc" in str(r).split("\t")[8])
+
+
+if __name__ == "__main__":
+ unittest.main()